diff --git a/Package.swift b/Package.swift index 1a0bcb67..11134382 100644 --- a/Package.swift +++ b/Package.swift @@ -20,7 +20,7 @@ // Sources/CCryptoBoringSSL directory. The source repository is at // https://boringssl.googlesource.com/boringssl. // -// BoringSSL Commit: dbad745811195c00b729efd0ee0a09b7d9fce1d2 +// BoringSSL Commit: 6a2ccdcc2ed1d37a43a2183658d2ae61fd5ce208 import PackageDescription diff --git a/Sources/CCryptoBoringSSL/CMakeLists.txt b/Sources/CCryptoBoringSSL/CMakeLists.txt index cfbb1ef3..8a7ff4c8 100644 --- a/Sources/CCryptoBoringSSL/CMakeLists.txt +++ b/Sources/CCryptoBoringSSL/CMakeLists.txt @@ -91,6 +91,7 @@ add_library(CCryptoBoringSSL STATIC "crypto/dh_extra/dh_asn1.c" "crypto/dh_extra/params.c" "crypto/digest_extra/digest_extra.c" + "crypto/dilithium/dilithium.c" "crypto/dsa/dsa.c" "crypto/dsa/dsa_asn1.c" "crypto/ec_extra/ec_asn1.c" @@ -100,10 +101,11 @@ add_library(CCryptoBoringSSL STATIC "crypto/ecdsa_extra/ecdsa_asn1.c" "crypto/engine/engine.c" "crypto/err/err.c" - "crypto/err/err_data.c" "crypto/evp/evp.c" "crypto/evp/evp_asn1.c" "crypto/evp/evp_ctx.c" + "crypto/evp/p_dh.c" + "crypto/evp/p_dh_asn1.c" "crypto/evp/p_dsa_asn1.c" "crypto/evp/p_ec.c" "crypto/evp/p_ec_asn1.c" @@ -119,89 +121,17 @@ add_library(CCryptoBoringSSL STATIC "crypto/evp/scrypt.c" "crypto/evp/sign.c" "crypto/ex_data.c" - "crypto/fipsmodule/aes/aes.c" - "crypto/fipsmodule/aes/aes_nohw.c" - "crypto/fipsmodule/aes/key_wrap.c" - "crypto/fipsmodule/aes/mode_wrappers.c" - "crypto/fipsmodule/bn/add.c" - "crypto/fipsmodule/bn/asm/x86_64-gcc.c" - "crypto/fipsmodule/bn/bn.c" - "crypto/fipsmodule/bn/bytes.c" - "crypto/fipsmodule/bn/cmp.c" - "crypto/fipsmodule/bn/ctx.c" - "crypto/fipsmodule/bn/div.c" - "crypto/fipsmodule/bn/div_extra.c" - "crypto/fipsmodule/bn/exponentiation.c" - "crypto/fipsmodule/bn/gcd.c" - "crypto/fipsmodule/bn/gcd_extra.c" - "crypto/fipsmodule/bn/generic.c" - "crypto/fipsmodule/bn/jacobi.c" - "crypto/fipsmodule/bn/montgomery.c" - "crypto/fipsmodule/bn/montgomery_inv.c" - "crypto/fipsmodule/bn/mul.c" - "crypto/fipsmodule/bn/prime.c" - "crypto/fipsmodule/bn/random.c" - "crypto/fipsmodule/bn/rsaz_exp.c" - "crypto/fipsmodule/bn/shift.c" - "crypto/fipsmodule/bn/sqrt.c" - "crypto/fipsmodule/cipher/aead.c" - "crypto/fipsmodule/cipher/cipher.c" - "crypto/fipsmodule/cipher/e_aes.c" - "crypto/fipsmodule/cipher/e_aesccm.c" - "crypto/fipsmodule/cmac/cmac.c" - "crypto/fipsmodule/dh/check.c" - "crypto/fipsmodule/dh/dh.c" - "crypto/fipsmodule/digest/digest.c" - "crypto/fipsmodule/digest/digests.c" - "crypto/fipsmodule/digestsign/digestsign.c" - "crypto/fipsmodule/ec/ec.c" - "crypto/fipsmodule/ec/ec_key.c" - "crypto/fipsmodule/ec/ec_montgomery.c" - "crypto/fipsmodule/ec/felem.c" - "crypto/fipsmodule/ec/oct.c" - "crypto/fipsmodule/ec/p224-64.c" - "crypto/fipsmodule/ec/p256-nistz.c" - "crypto/fipsmodule/ec/p256.c" - "crypto/fipsmodule/ec/scalar.c" - "crypto/fipsmodule/ec/simple.c" - "crypto/fipsmodule/ec/simple_mul.c" - "crypto/fipsmodule/ec/util.c" - "crypto/fipsmodule/ec/wnaf.c" - "crypto/fipsmodule/ecdh/ecdh.c" - "crypto/fipsmodule/ecdsa/ecdsa.c" + "crypto/fipsmodule/bcm.c" "crypto/fipsmodule/fips_shared_support.c" - "crypto/fipsmodule/hkdf/hkdf.c" - "crypto/fipsmodule/hmac/hmac.c" - "crypto/fipsmodule/md4/md4.c" - "crypto/fipsmodule/md5/md5.c" - "crypto/fipsmodule/modes/cbc.c" - "crypto/fipsmodule/modes/cfb.c" - "crypto/fipsmodule/modes/ctr.c" - "crypto/fipsmodule/modes/gcm.c" - "crypto/fipsmodule/modes/gcm_nohw.c" - "crypto/fipsmodule/modes/ofb.c" - "crypto/fipsmodule/modes/polyval.c" - 
"crypto/fipsmodule/rand/ctrdrbg.c" - "crypto/fipsmodule/rand/fork_detect.c" - "crypto/fipsmodule/rand/rand.c" - "crypto/fipsmodule/rand/urandom.c" - "crypto/fipsmodule/rsa/blinding.c" - "crypto/fipsmodule/rsa/padding.c" - "crypto/fipsmodule/rsa/rsa.c" - "crypto/fipsmodule/rsa/rsa_impl.c" - "crypto/fipsmodule/self_check/fips.c" - "crypto/fipsmodule/self_check/self_check.c" - "crypto/fipsmodule/service_indicator/service_indicator.c" - "crypto/fipsmodule/sha/sha1.c" - "crypto/fipsmodule/sha/sha256.c" - "crypto/fipsmodule/sha/sha512.c" - "crypto/fipsmodule/tls/kdf.c" "crypto/hpke/hpke.c" "crypto/hrss/hrss.c" "crypto/keccak/keccak.c" "crypto/kyber/kyber.c" "crypto/lhash/lhash.c" + "crypto/md4/md4.c" + "crypto/md5/md5.c" "crypto/mem.c" + "crypto/mldsa/mldsa.c" "crypto/obj/obj.c" "crypto/obj/obj_xref.c" "crypto/pem/pem_all.c" @@ -222,26 +152,29 @@ add_library(CCryptoBoringSSL STATIC "crypto/poly1305/poly1305_vec.c" "crypto/pool/pool.c" "crypto/rand_extra/deterministic.c" + "crypto/rand_extra/fork_detect.c" "crypto/rand_extra/forkunsafe.c" "crypto/rand_extra/getentropy.c" "crypto/rand_extra/ios.c" "crypto/rand_extra/passive.c" "crypto/rand_extra/rand_extra.c" "crypto/rand_extra/trusty.c" + "crypto/rand_extra/urandom.c" "crypto/rand_extra/windows.c" "crypto/rc4/rc4.c" "crypto/refcount.c" "crypto/rsa_extra/rsa_asn1.c" "crypto/rsa_extra/rsa_crypt.c" "crypto/rsa_extra/rsa_print.c" + "crypto/sha/sha1.c" "crypto/siphash/siphash.c" - "crypto/spx/address.c" - "crypto/spx/fors.c" - "crypto/spx/merkle.c" "crypto/spx/spx.c" + "crypto/spx/spx_address.c" + "crypto/spx/spx_fors.c" + "crypto/spx/spx_merkle.c" + "crypto/spx/spx_thash.c" "crypto/spx/spx_util.c" - "crypto/spx/thash.c" - "crypto/spx/wots.c" + "crypto/spx/spx_wots.c" "crypto/stack/stack.c" "crypto/thread.c" "crypto/thread_none.c" @@ -319,80 +252,21 @@ add_library(CCryptoBoringSSL STATIC "crypto/x509/x_spki.c" "crypto/x509/x_val.c" "crypto/x509/x_x509.c" - "crypto/x509/x_x509a.c") + "crypto/x509/x_x509a.c" + "gen/crypto/err_data.c") if(CMAKE_SYSTEM_NAME STREQUAL Darwin AND CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64") target_sources(CCryptoBoringSSL PRIVATE - crypto/chacha/chacha-x86_64-mac.mac.x86_64.S - crypto/cipher_extra/aes128gcmsiv-x86_64-mac.mac.x86_64.S - crypto/cipher_extra/chacha20_poly1305_x86_64-mac.mac.x86_64.S - crypto/fipsmodule/aesni-gcm-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/aesni-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/ghash-ssse3-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/ghash-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/md5-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/p256-x86_64-asm-mac.mac.x86_64.S - crypto/fipsmodule/p256_beeu-x86_64-asm-mac.mac.x86_64.S - crypto/fipsmodule/rdrand-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/rsaz-avx2-mac.mac.x86_64.S - crypto/fipsmodule/sha1-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/sha256-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/sha512-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/vpaes-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/x86_64-mont-mac.mac.x86_64.S - crypto/fipsmodule/x86_64-mont5-mac.mac.x86_64.S) +) elseif(CMAKE_SYSTEM_NAME MATCHES "Linux|Android" AND CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64") target_sources(CCryptoBoringSSL PRIVATE - crypto/chacha/chacha-x86_64-linux.linux.x86_64.S - crypto/cipher_extra/aes128gcmsiv-x86_64-linux.linux.x86_64.S - crypto/cipher_extra/chacha20_poly1305_x86_64-linux.linux.x86_64.S - crypto/fipsmodule/aesni-gcm-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/aesni-x86_64-linux.linux.x86_64.S - 
crypto/fipsmodule/ghash-ssse3-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/ghash-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/md5-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/p256-x86_64-asm-linux.linux.x86_64.S - crypto/fipsmodule/p256_beeu-x86_64-asm-linux.linux.x86_64.S - crypto/fipsmodule/rdrand-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/rsaz-avx2-linux.linux.x86_64.S - crypto/fipsmodule/sha1-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/sha256-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/sha512-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/vpaes-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/x86_64-mont-linux.linux.x86_64.S - crypto/fipsmodule/x86_64-mont5-linux.linux.x86_64.S) +) elseif(CMAKE_SYSTEM_NAME STREQUAL Darwin AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") target_sources(CCryptoBoringSSL PRIVATE - crypto/chacha/chacha-armv8-ios.ios.aarch64.S - crypto/cipher_extra/chacha20_poly1305_armv8-ios.ios.aarch64.S - crypto/fipsmodule/aesv8-armv8-ios.ios.aarch64.S - crypto/fipsmodule/aesv8-gcm-armv8-ios.ios.aarch64.S - crypto/fipsmodule/armv8-mont-ios.ios.aarch64.S - crypto/fipsmodule/bn-armv8-ios.ios.aarch64.S - crypto/fipsmodule/ghash-neon-armv8-ios.ios.aarch64.S - crypto/fipsmodule/ghashv8-armv8-ios.ios.aarch64.S - crypto/fipsmodule/p256-armv8-asm-ios.ios.aarch64.S - crypto/fipsmodule/p256_beeu-armv8-asm-ios.ios.aarch64.S - crypto/fipsmodule/sha1-armv8-ios.ios.aarch64.S - crypto/fipsmodule/sha256-armv8-ios.ios.aarch64.S - crypto/fipsmodule/sha512-armv8-ios.ios.aarch64.S - crypto/fipsmodule/vpaes-armv8-ios.ios.aarch64.S) +) elseif(CMAKE_SYSTEM_NAME MATCHES "Linux|Android" AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") target_sources(CCryptoBoringSSL PRIVATE - crypto/chacha/chacha-armv8-linux.linux.aarch64.S - crypto/cipher_extra/chacha20_poly1305_armv8-linux.linux.aarch64.S - crypto/fipsmodule/aesv8-armv8-linux.linux.aarch64.S - crypto/fipsmodule/aesv8-gcm-armv8-linux.linux.aarch64.S - crypto/fipsmodule/armv8-mont-linux.linux.aarch64.S - crypto/fipsmodule/bn-armv8-linux.linux.aarch64.S - crypto/fipsmodule/ghash-neon-armv8-linux.linux.aarch64.S - crypto/fipsmodule/ghashv8-armv8-linux.linux.aarch64.S - crypto/fipsmodule/p256-armv8-asm-linux.linux.aarch64.S - crypto/fipsmodule/p256_beeu-armv8-asm-linux.linux.aarch64.S - crypto/fipsmodule/sha1-armv8-linux.linux.aarch64.S - crypto/fipsmodule/sha256-armv8-linux.linux.aarch64.S - crypto/fipsmodule/sha512-armv8-linux.linux.aarch64.S - crypto/fipsmodule/vpaes-armv8-linux.linux.aarch64.S) +) endif() target_include_directories(CCryptoBoringSSL PUBLIC diff --git a/Sources/CCryptoBoringSSL/crypto/base64/base64.c b/Sources/CCryptoBoringSSL/crypto/base64/base64.c index b934e919..34e23e6b 100644 --- a/Sources/CCryptoBoringSSL/crypto/base64/base64.c +++ b/Sources/CCryptoBoringSSL/crypto/base64/base64.c @@ -307,6 +307,10 @@ static int base64_decode_quad(uint8_t *out, size_t *out_num_bytes, (in[2] == '=') << 1 | (in[3] == '='); + // In presence of padding, the lowest bits of v are unused. Canonical encoding + // (RFC 4648, section 3.5) requires that these bits all be set to zero. Common + // PEM parsers accept noncanonical base64, adding to the malleability of the + // format. This decoder follows OpenSSL's and Go's PEM parsers and accepts it. switch (padding_pattern) { case 0: // The common case of no padding. 
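For context on the canonical-padding comment in the base64.c hunk above: when a quad ends in one or two '=' characters, the final 6-bit symbol contributes bits that fall past the end of the decoded output. RFC 4648 section 3.5 requires those spare bits to be zero, but a decoder that simply discards them will accept several distinct encodings of the same bytes. The following is a standalone sketch in plain C, not part of this diff and not the BoringSSL API, showing two quads that decode to the same byte while only one is canonical:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative only: decode one base64 quad that carries two '=' padding
 * characters, and report whether the spare low bits are zero, i.e. whether
 * the encoding is canonical per RFC 4648 section 3.5. */
static int decode_quad_2pad(const char quad[4], uint8_t *out_byte,
                            int *out_canonical) {
  static const char kAlphabet[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  const char *p0 = strchr(kAlphabet, quad[0]);
  const char *p1 = strchr(kAlphabet, quad[1]);
  if (p0 == NULL || p1 == NULL || quad[2] != '=' || quad[3] != '=') {
    return 0;
  }
  uint32_t v = (uint32_t)(p0 - kAlphabet) << 6 | (uint32_t)(p1 - kAlphabet);
  *out_byte = (uint8_t)(v >> 4);    /* the single decoded byte */
  *out_canonical = (v & 0xf) == 0;  /* spare low bits must be zero */
  return 1;
}

int main(void) {
  uint8_t b;
  int canon;
  decode_quad_2pad("Zg==", &b, &canon);
  printf("Zg== -> 0x%02x canonical=%d\n", b, canon);  /* 0x66 ('f'), 1 */
  decode_quad_2pad("Zh==", &b, &canon);
  printf("Zh== -> 0x%02x canonical=%d\n", b, canon);  /* 0x66 ('f'), 0 */
  return 0;
}

A lenient decoder (as the comment notes, matching OpenSSL's and Go's PEM parsers) treats both "Zg==" and "Zh==" as the byte 'f'; only the first is the canonical encoding.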
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.h b/Sources/CCryptoBoringSSL/crypto/bcm_support.h similarity index 55% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.h rename to Sources/CCryptoBoringSSL/crypto/bcm_support.h index 0b7112a0..9556c17f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.h +++ b/Sources/CCryptoBoringSSL/crypto/bcm_support.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020, Google Inc. +/* Copyright (c) 2024, Google Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -12,11 +12,17 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#ifndef OPENSSL_HEADER_CRYPTO_FORK_DETECT_H -#define OPENSSL_HEADER_CRYPTO_FORK_DETECT_H +#ifndef OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H +#define OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H #include +// Provided by libcrypto, called from BCM + +#if defined(__cplusplus) +extern "C" { +#endif + #if defined(OPENSSL_LINUX) // On linux we use MADVISE instead of pthread_atfork(), due // to concerns about clone() being used for address space @@ -29,15 +35,54 @@ // iOS doesn't normally allow fork in apps, but it's there. #define OPENSSL_FORK_DETECTION #define OPENSSL_FORK_DETECTION_PTHREAD_ATFORK -#elif defined(OPENSSL_WINDOWS) || defined(OPENSSL_TRUSTY) +#elif defined(OPENSSL_WINDOWS) || defined(OPENSSL_TRUSTY) || \ + defined(__ZEPHYR__) || defined(CROS_EC) // These platforms do not fork. #define OPENSSL_DOES_NOT_FORK #endif -#if defined(__cplusplus) -extern "C" { +#if defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) +#define OPENSSL_RAND_DETERMINISTIC +#elif defined(OPENSSL_TRUSTY) +#define OPENSSL_RAND_TRUSTY +#elif defined(OPENSSL_WINDOWS) +#define OPENSSL_RAND_WINDOWS +#elif defined(OPENSSL_LINUX) +#define OPENSSL_RAND_URANDOM +#elif defined(OPENSSL_APPLE) && !defined(OPENSSL_MACOS) +// Unlike macOS, iOS and similar hide away getentropy(). +#define OPENSSL_RAND_IOS +#else +// By default if you are integrating BoringSSL we expect you to +// provide getentropy from the header file. +#define OPENSSL_RAND_GETENTROPY #endif +// Provided by libcrypto, called from BCM + +// CRYPTO_init_sysrand initializes long-lived resources needed to draw entropy +// from the operating system, if the operating system requires initialization. +void CRYPTO_init_sysrand(void); + +// CRYPTO_sysrand fills |len| bytes at |buf| with entropy from the operating +// system. +void CRYPTO_sysrand(uint8_t *buf, size_t len); + +// CRYPTO_sysrand_if_available fills |len| bytes at |buf| with entropy from the +// operating system, or early /dev/urandom data, and returns 1, _if_ the entropy +// pool is initialized or if getrandom() is not available and not in FIPS mode. +// Otherwise it will not block and will instead fill |buf| with all zeros and +// return 0. +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len); + +// CRYPTO_sysrand_for_seed fills |len| bytes at |buf| with entropy from the +// operating system. It may draw from the |GRND_RANDOM| pool on Android, +// depending on the vendor's configuration. +void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len); + +// RAND_need_entropy is called whenever the BCM module has stopped because it +// has run out of entropy. 
+void RAND_need_entropy(size_t bytes_needed); // crypto_get_fork_generation returns the fork generation number for the current // process, or zero if not supported on the platform. The fork generation number @@ -60,8 +105,9 @@ OPENSSL_EXPORT uint64_t CRYPTO_get_fork_generation(void); OPENSSL_EXPORT void CRYPTO_fork_detect_force_madv_wipeonfork_for_testing( int on); + #if defined(__cplusplus) } // extern C #endif -#endif // OPENSSL_HEADER_CRYPTO_FORK_DETECT_H +#endif // OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H diff --git a/Sources/CCryptoBoringSSL/crypto/bio/bio.c b/Sources/CCryptoBoringSSL/crypto/bio/bio.c index 77181653..0de106b9 100644 --- a/Sources/CCryptoBoringSSL/crypto/bio/bio.c +++ b/Sources/CCryptoBoringSSL/crypto/bio/bio.c @@ -658,38 +658,38 @@ void BIO_meth_free(BIO_METHOD *method) { } int BIO_meth_set_create(BIO_METHOD *method, - int (*create)(BIO *)) { - method->create = create; + int (*create_func)(BIO *)) { + method->create = create_func; return 1; } int BIO_meth_set_destroy(BIO_METHOD *method, - int (*destroy)(BIO *)) { - method->destroy = destroy; + int (*destroy_func)(BIO *)) { + method->destroy = destroy_func; return 1; } int BIO_meth_set_write(BIO_METHOD *method, - int (*write)(BIO *, const char *, int)) { - method->bwrite = write; + int (*write_func)(BIO *, const char *, int)) { + method->bwrite = write_func; return 1; } int BIO_meth_set_read(BIO_METHOD *method, - int (*read)(BIO *, char *, int)) { - method->bread = read; + int (*read_func)(BIO *, char *, int)) { + method->bread = read_func; return 1; } int BIO_meth_set_gets(BIO_METHOD *method, - int (*gets)(BIO *, char *, int)) { - method->bgets = gets; + int (*gets_func)(BIO *, char *, int)) { + method->bgets = gets_func; return 1; } int BIO_meth_set_ctrl(BIO_METHOD *method, - long (*ctrl)(BIO *, int, long, void *)) { - method->ctrl = ctrl; + long (*ctrl_func)(BIO *, int, long, void *)) { + method->ctrl = ctrl_func; return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/bytestring/cbs.c b/Sources/CCryptoBoringSSL/crypto/bytestring/cbs.c index ee5a41b2..625c6a06 100644 --- a/Sources/CCryptoBoringSSL/crypto/bytestring/cbs.c +++ b/Sources/CCryptoBoringSSL/crypto/bytestring/cbs.c @@ -507,11 +507,9 @@ int CBS_get_asn1_int64(CBS *cbs, int64_t *out) { return 0; } uint8_t sign_extend[sizeof(int64_t)]; - memset(sign_extend, is_negative ? 0xff : 0, sizeof(sign_extend)); - for (size_t i = 0; i < len; i++) { - sign_extend[i] = data[len - i - 1]; - } - memcpy(out, sign_extend, sizeof(sign_extend)); + OPENSSL_memset(sign_extend, is_negative ? 0xff : 0, sizeof(sign_extend)); + OPENSSL_memcpy(sign_extend + sizeof(int64_t) - len, data, len); + *out = CRYPTO_load_u64_be(sign_extend); return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/bytestring/unicode.c b/Sources/CCryptoBoringSSL/crypto/bytestring/unicode.c index 00e603c1..4f990aef 100644 --- a/Sources/CCryptoBoringSSL/crypto/bytestring/unicode.c +++ b/Sources/CCryptoBoringSSL/crypto/bytestring/unicode.c @@ -18,11 +18,12 @@ static int is_valid_code_point(uint32_t v) { - // References in the following are to Unicode 9.0.0. + // References in the following are to Unicode 15.0.0. if (// The Unicode space runs from zero to 0x10ffff (3.4 D9). v > 0x10ffff || // Values 0x...fffe, 0x...ffff, and 0xfdd0-0xfdef are permanently reserved - // (3.4 D14) + // as noncharacters (3.4 D14). See also 23.7. As our APIs are intended for + // "open interchange", such as ASN.1, we reject them. (v & 0xfffe) == 0xfffe || (v >= 0xfdd0 && v <= 0xfdef) || // Surrogate code points are invalid (3.2 C1). 
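The CBS_get_asn1_int64 hunk above replaces the byte-reversal loop with a single copy into the high end of an 8-byte buffer followed by a big-endian load, so negative two's-complement INTEGERs are sign-extended in one step without assembling a host-endian representation byte by byte. Below is a standalone sketch of the same idea in plain C; it uses memset/memcpy and a hand-rolled big-endian load as stand-ins for BoringSSL's OPENSSL_memset/OPENSSL_memcpy and CRYPTO_load_u64_be, and is not part of this diff:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Load 8 bytes as a big-endian uint64_t (stand-in for CRYPTO_load_u64_be). */
static uint64_t load_u64_be(const uint8_t in[8]) {
  uint64_t v = 0;
  for (size_t i = 0; i < 8; i++) {
    v = (v << 8) | in[i];
  }
  return v;
}

/* Sign-extend a big-endian two's-complement integer of 1..8 bytes into an
 * int64_t, mirroring the new CBS_get_asn1_int64 approach: pre-fill with 0x00
 * or 0xff depending on the sign bit, copy the bytes into the low-order
 * (rightmost) end, then perform one big-endian load. Assumes 1 <= len <= 8,
 * which the real code checks before reaching this point. */
static int64_t sign_extend_be(const uint8_t *data, size_t len) {
  int is_negative = (data[0] & 0x80) != 0;
  uint8_t buf[sizeof(int64_t)];
  memset(buf, is_negative ? 0xff : 0x00, sizeof(buf));
  memcpy(buf + sizeof(buf) - len, data, len);
  uint64_t u = load_u64_be(buf);
  int64_t out;
  memcpy(&out, &u, sizeof(out));  /* reinterpret the two's-complement bits */
  return out;
}

int main(void) {
  const uint8_t minus_one[] = {0xff};        /* DER INTEGER -1   */
  const uint8_t minus_256[] = {0xff, 0x00};  /* DER INTEGER -256 */
  const uint8_t plus_300[] = {0x01, 0x2c};   /* DER INTEGER 300  */
  printf("%lld %lld %lld\n", (long long)sign_extend_be(minus_one, 1),
         (long long)sign_extend_be(minus_256, 2),
         (long long)sign_extend_be(plus_300, 2));  /* prints: -1 -256 300 */
  return 0;
}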
diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-ios.ios.arm.S deleted file mode 100644 index 39e6890e..00000000 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-ios.ios.arm.S +++ /dev/null @@ -1,1462 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. - - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -#if defined(__thumb2__) || defined(__clang__) -#define ldrhsb ldrbhs -#endif - -.align 5 -Lsigma: -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral -Lone: -.long 1,0,0,0 - -.globl _ChaCha20_ctr32_nohw -.private_extern _ChaCha20_ctr32_nohw -#ifdef __thumb2__ -.thumb_func _ChaCha20_ctr32_nohw -#endif -.align 5 -_ChaCha20_ctr32_nohw: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0,r1,r2,r4-r11,lr} - adr r14,Lsigma - ldmia r12,{r4,r5,r6,r7} @ load counter and nonce - sub sp,sp,#4*(16) @ off-load area - stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce - ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key - ldmia r14,{r0,r1,r2,r3} @ load sigma - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key - stmdb sp!,{r0,r1,r2,r3} @ copy sigma - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - b Loop_outer_enter - -.align 4 -Loop_outer: - ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material - str r11,[sp,#4*(32+2)] @ save len - str r12, [sp,#4*(32+1)] @ save inp - str r14, [sp,#4*(32+0)] @ save out -Loop_outer_enter: - ldr r11, [sp,#4*(15)] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - ldr r10, [sp,#4*(13)] - ldr r14,[sp,#4*(14)] - str r11, [sp,#4*(16+15)] - mov r11,#10 - b Loop - -.align 4 -Loop: - subs r11,r11,#1 - add r0,r0,r4 - mov r12,r12,ror#16 - add r1,r1,r5 - mov r10,r10,ror#16 - eor r12,r12,r0,ror#16 - eor r10,r10,r1,ror#16 - add r8,r8,r12 - mov r4,r4,ror#20 - add r9,r9,r10 - mov r5,r5,ror#20 - eor r4,r4,r8,ror#20 - eor r5,r5,r9,ror#20 - add r0,r0,r4 - mov r12,r12,ror#24 - add r1,r1,r5 - mov r10,r10,ror#24 - eor r12,r12,r0,ror#24 - eor r10,r10,r1,ror#24 - add r8,r8,r12 - mov r4,r4,ror#25 - add r9,r9,r10 - mov r5,r5,ror#25 - str r10,[sp,#4*(16+13)] - ldr r10,[sp,#4*(16+15)] - eor r4,r4,r8,ror#25 - eor r5,r5,r9,ror#25 - str r8,[sp,#4*(16+8)] - ldr r8,[sp,#4*(16+10)] - add r2,r2,r6 - mov r14,r14,ror#16 - str r9,[sp,#4*(16+9)] - ldr r9,[sp,#4*(16+11)] - add r3,r3,r7 - mov r10,r10,ror#16 - eor r14,r14,r2,ror#16 - eor r10,r10,r3,ror#16 - add r8,r8,r14 - mov r6,r6,ror#20 - add r9,r9,r10 - mov r7,r7,ror#20 - eor r6,r6,r8,ror#20 - eor r7,r7,r9,ror#20 - add r2,r2,r6 - mov r14,r14,ror#24 - add r3,r3,r7 - mov r10,r10,ror#24 - eor r14,r14,r2,ror#24 - eor r10,r10,r3,ror#24 - add r8,r8,r14 - mov r6,r6,ror#25 - add r9,r9,r10 - mov r7,r7,ror#25 - eor r6,r6,r8,ror#25 - eor r7,r7,r9,ror#25 - add r0,r0,r5 - mov r10,r10,ror#16 - add r1,r1,r6 - mov r12,r12,ror#16 - eor r10,r10,r0,ror#16 - eor r12,r12,r1,ror#16 - add r8,r8,r10 - mov r5,r5,ror#20 - add r9,r9,r12 - mov r6,r6,ror#20 - eor r5,r5,r8,ror#20 - eor r6,r6,r9,ror#20 - add r0,r0,r5 - mov r10,r10,ror#24 - 
add r1,r1,r6 - mov r12,r12,ror#24 - eor r10,r10,r0,ror#24 - eor r12,r12,r1,ror#24 - add r8,r8,r10 - mov r5,r5,ror#25 - str r10,[sp,#4*(16+15)] - ldr r10,[sp,#4*(16+13)] - add r9,r9,r12 - mov r6,r6,ror#25 - eor r5,r5,r8,ror#25 - eor r6,r6,r9,ror#25 - str r8,[sp,#4*(16+10)] - ldr r8,[sp,#4*(16+8)] - add r2,r2,r7 - mov r10,r10,ror#16 - str r9,[sp,#4*(16+11)] - ldr r9,[sp,#4*(16+9)] - add r3,r3,r4 - mov r14,r14,ror#16 - eor r10,r10,r2,ror#16 - eor r14,r14,r3,ror#16 - add r8,r8,r10 - mov r7,r7,ror#20 - add r9,r9,r14 - mov r4,r4,ror#20 - eor r7,r7,r8,ror#20 - eor r4,r4,r9,ror#20 - add r2,r2,r7 - mov r10,r10,ror#24 - add r3,r3,r4 - mov r14,r14,ror#24 - eor r10,r10,r2,ror#24 - eor r14,r14,r3,ror#24 - add r8,r8,r10 - mov r7,r7,ror#25 - add r9,r9,r14 - mov r4,r4,ror#25 - eor r7,r7,r8,ror#25 - eor r4,r4,r9,ror#25 - bne Loop - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - cmp r11,#64 @ done yet? -#ifdef __thumb2__ - itete lo -#endif - addlo r12,sp,#4*(0) @ shortcut or ... - ldrhs r12,[sp,#4*(32+1)] @ ... load inp - addlo r14,sp,#4*(0) @ shortcut or ... - ldrhs r14,[sp,#4*(32+0)] @ ... load out - - ldr r8,[sp,#4*(0)] @ load key material - ldr r9,[sp,#4*(1)] - -#if __ARM_ARCH>=6 || !defined(__ARMEB__) -# if __ARM_ARCH<7 - orr r10,r12,r14 - tst r10,#3 @ are input and output aligned? - ldr r10,[sp,#4*(2)] - bne Lunaligned - cmp r11,#64 @ restore flags -# else - ldr r10,[sp,#4*(2)] -# endif - ldr r11,[sp,#4*(3)] - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - - add r2,r2,r10 - add r3,r3,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH>=6 && defined(__ARMEB__) - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r0,r0,r8 @ xor with input - eorhs r1,r1,r9 - add r8,sp,#4*(4) - str r0,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r1,[r14,#-12] - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r6,r10 - add r7,r7,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH>=6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 - add r8,sp,#4*(8) - str r4,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r5,[r14,#-12] - ldmia r8,{r8,r9,r10,r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] -# ifdef __thumb2__ - itt hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r2,r2,r10 - add r3,r3,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH>=6 && defined(__ARMEB__) 
- rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r0,r0,r8 - eorhs r1,r1,r9 - add r8,sp,#4*(12) - str r0,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - str r1,[r14,#-12] - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 -# ifdef __thumb2__ - itt hi -# endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r6,r10 - add r7,r7,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH>=6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 -# ifdef __thumb2__ - it ne -# endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -# ifdef __thumb2__ - itt hs -# endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r4,[r14],#16 @ store output - str r5,[r14,#-12] -# ifdef __thumb2__ - it hs -# endif - subhs r11,r8,#64 @ len-=64 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi Loop_outer - - beq Ldone -# if __ARM_ARCH<7 - b Ltail - -.align 4 -Lunaligned:@ unaligned endian-neutral path - cmp r11,#64 @ restore flags -# endif -#endif -#if __ARM_ARCH<7 - ldr r11,[sp,#4*(3)] - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 - add r2,r2,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r3,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+0) - ldmia r8,{r8,r9,r10,r11} @ load key material - add r0,sp,#4*(16+8) - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 - add r6,r6,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r7,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] - add r8,sp,#4*(4+4) - ldmia r8,{r8,r9,r10,r11} @ load key material - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half -# ifdef __thumb2__ - itt hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" - strhi r11,[sp,#4*(16+11)] @ copy "rx" - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 - add r2,r2,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r3,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+8) - ldmia r8,{r8,r9,r10,r11} @ load key material - add r4,r4,r8 @ accumulate key material -# ifdef __thumb2__ - itt hi -# endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value - add r5,r5,r9 - add r6,r6,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r7,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] -# ifdef __thumb2__ - it ne -# endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -# ifdef __thumb2__ - it hs -# endif - subhs r11,r8,#64 @ len-=64 - bhi Loop_outer - - beq Ldone -#endif - -Ltail: - ldr r12,[sp,#4*(32+1)] @ load inp - add r9,sp,#4*(0) - ldr r14,[sp,#4*(32+0)] @ load out - -Loop_tail: - ldrb r10,[r9],#1 @ read buffer on stack - ldrb r11,[r12],#1 @ read input - subs r8,r8,#1 - eor r11,r11,r10 - strb r11,[r14],#1 @ store output - bne Loop_tail - -Ldone: - add sp,sp,#4*(32+3) - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} - -#if __ARM_MAX_ARCH__>=7 - - - -.globl _ChaCha20_ctr32_neon -.private_extern _ChaCha20_ctr32_neon -#ifdef __thumb2__ -.thumb_func _ChaCha20_ctr32_neon -#endif -.align 5 -_ChaCha20_ctr32_neon: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0,r1,r2,r4-r11,lr} - adr r14,Lsigma - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so - stmdb sp!,{r0,r1,r2,r3} - - vld1.32 {q1,q2},[r3] @ load key - ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key - - sub sp,sp,#4*(16+16) - vld1.32 {q3},[r12] @ load counter and nonce - add r12,sp,#4*8 - ldmia r14,{r0,r1,r2,r3} @ load sigma - vld1.32 {q0},[r14]! 
@ load sigma - vld1.32 {q12},[r14] @ one - vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce - vst1.32 {q0,q1},[sp] @ copy sigma|1/2key - - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - vshl.i32 d26,d24,#1 @ two - vstr d24,[sp,#4*(16+0)] - vshl.i32 d28,d24,#2 @ four - vstr d26,[sp,#4*(16+2)] - vmov q4,q0 - vstr d28,[sp,#4*(16+4)] - vmov q8,q0 - vmov q5,q1 - vmov q9,q1 - b Loop_neon_enter - -.align 4 -Loop_neon_outer: - ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material - cmp r11,#64*2 @ if len<=64*2 - bls Lbreak_neon @ switch to integer-only - vmov q4,q0 - str r11,[sp,#4*(32+2)] @ save len - vmov q8,q0 - str r12, [sp,#4*(32+1)] @ save inp - vmov q5,q1 - str r14, [sp,#4*(32+0)] @ save out - vmov q9,q1 -Loop_neon_enter: - ldr r11, [sp,#4*(15)] - vadd.i32 q7,q3,q12 @ counter+1 - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - vmov q6,q2 - ldr r10, [sp,#4*(13)] - vmov q10,q2 - ldr r14,[sp,#4*(14)] - vadd.i32 q11,q7,q12 @ counter+2 - str r11, [sp,#4*(16+15)] - mov r11,#10 - add r12,r12,#3 @ counter+3 - b Loop_neon - -.align 4 -Loop_neon: - subs r11,r11,#1 - vadd.i32 q0,q0,q1 - add r0,r0,r4 - vadd.i32 q4,q4,q5 - mov r12,r12,ror#16 - vadd.i32 q8,q8,q9 - add r1,r1,r5 - veor q3,q3,q0 - mov r10,r10,ror#16 - veor q7,q7,q4 - eor r12,r12,r0,ror#16 - veor q11,q11,q8 - eor r10,r10,r1,ror#16 - vrev32.16 q3,q3 - add r8,r8,r12 - vrev32.16 q7,q7 - mov r4,r4,ror#20 - vrev32.16 q11,q11 - add r9,r9,r10 - vadd.i32 q2,q2,q3 - mov r5,r5,ror#20 - vadd.i32 q6,q6,q7 - eor r4,r4,r8,ror#20 - vadd.i32 q10,q10,q11 - eor r5,r5,r9,ror#20 - veor q12,q1,q2 - add r0,r0,r4 - veor q13,q5,q6 - mov r12,r12,ror#24 - veor q14,q9,q10 - add r1,r1,r5 - vshr.u32 q1,q12,#20 - mov r10,r10,ror#24 - vshr.u32 q5,q13,#20 - eor r12,r12,r0,ror#24 - vshr.u32 q9,q14,#20 - eor r10,r10,r1,ror#24 - vsli.32 q1,q12,#12 - add r8,r8,r12 - vsli.32 q5,q13,#12 - mov r4,r4,ror#25 - vsli.32 q9,q14,#12 - add r9,r9,r10 - vadd.i32 q0,q0,q1 - mov r5,r5,ror#25 - vadd.i32 q4,q4,q5 - str r10,[sp,#4*(16+13)] - vadd.i32 q8,q8,q9 - ldr r10,[sp,#4*(16+15)] - veor q12,q3,q0 - eor r4,r4,r8,ror#25 - veor q13,q7,q4 - eor r5,r5,r9,ror#25 - veor q14,q11,q8 - str r8,[sp,#4*(16+8)] - vshr.u32 q3,q12,#24 - ldr r8,[sp,#4*(16+10)] - vshr.u32 q7,q13,#24 - add r2,r2,r6 - vshr.u32 q11,q14,#24 - mov r14,r14,ror#16 - vsli.32 q3,q12,#8 - str r9,[sp,#4*(16+9)] - vsli.32 q7,q13,#8 - ldr r9,[sp,#4*(16+11)] - vsli.32 q11,q14,#8 - add r3,r3,r7 - vadd.i32 q2,q2,q3 - mov r10,r10,ror#16 - vadd.i32 q6,q6,q7 - eor r14,r14,r2,ror#16 - vadd.i32 q10,q10,q11 - eor r10,r10,r3,ror#16 - veor q12,q1,q2 - add r8,r8,r14 - veor q13,q5,q6 - mov r6,r6,ror#20 - veor q14,q9,q10 - add r9,r9,r10 - vshr.u32 q1,q12,#25 - mov r7,r7,ror#20 - vshr.u32 q5,q13,#25 - eor r6,r6,r8,ror#20 - vshr.u32 q9,q14,#25 - eor r7,r7,r9,ror#20 - vsli.32 q1,q12,#7 - add r2,r2,r6 - vsli.32 q5,q13,#7 - mov r14,r14,ror#24 - vsli.32 q9,q14,#7 - add r3,r3,r7 - vext.8 q2,q2,q2,#8 - mov r10,r10,ror#24 - vext.8 q6,q6,q6,#8 - eor r14,r14,r2,ror#24 - vext.8 q10,q10,q10,#8 - eor r10,r10,r3,ror#24 - vext.8 q1,q1,q1,#4 - add r8,r8,r14 - vext.8 q5,q5,q5,#4 - mov r6,r6,ror#25 - vext.8 q9,q9,q9,#4 - add r9,r9,r10 - vext.8 q3,q3,q3,#12 - mov r7,r7,ror#25 - vext.8 q7,q7,q7,#12 - eor r6,r6,r8,ror#25 - vext.8 q11,q11,q11,#12 - eor r7,r7,r9,ror#25 - vadd.i32 q0,q0,q1 - add r0,r0,r5 - vadd.i32 q4,q4,q5 - mov r10,r10,ror#16 - vadd.i32 q8,q8,q9 - add r1,r1,r6 - veor q3,q3,q0 - mov r12,r12,ror#16 - veor q7,q7,q4 - eor r10,r10,r0,ror#16 - veor q11,q11,q8 - eor r12,r12,r1,ror#16 - vrev32.16 q3,q3 - add r8,r8,r10 - 
vrev32.16 q7,q7 - mov r5,r5,ror#20 - vrev32.16 q11,q11 - add r9,r9,r12 - vadd.i32 q2,q2,q3 - mov r6,r6,ror#20 - vadd.i32 q6,q6,q7 - eor r5,r5,r8,ror#20 - vadd.i32 q10,q10,q11 - eor r6,r6,r9,ror#20 - veor q12,q1,q2 - add r0,r0,r5 - veor q13,q5,q6 - mov r10,r10,ror#24 - veor q14,q9,q10 - add r1,r1,r6 - vshr.u32 q1,q12,#20 - mov r12,r12,ror#24 - vshr.u32 q5,q13,#20 - eor r10,r10,r0,ror#24 - vshr.u32 q9,q14,#20 - eor r12,r12,r1,ror#24 - vsli.32 q1,q12,#12 - add r8,r8,r10 - vsli.32 q5,q13,#12 - mov r5,r5,ror#25 - vsli.32 q9,q14,#12 - str r10,[sp,#4*(16+15)] - vadd.i32 q0,q0,q1 - ldr r10,[sp,#4*(16+13)] - vadd.i32 q4,q4,q5 - add r9,r9,r12 - vadd.i32 q8,q8,q9 - mov r6,r6,ror#25 - veor q12,q3,q0 - eor r5,r5,r8,ror#25 - veor q13,q7,q4 - eor r6,r6,r9,ror#25 - veor q14,q11,q8 - str r8,[sp,#4*(16+10)] - vshr.u32 q3,q12,#24 - ldr r8,[sp,#4*(16+8)] - vshr.u32 q7,q13,#24 - add r2,r2,r7 - vshr.u32 q11,q14,#24 - mov r10,r10,ror#16 - vsli.32 q3,q12,#8 - str r9,[sp,#4*(16+11)] - vsli.32 q7,q13,#8 - ldr r9,[sp,#4*(16+9)] - vsli.32 q11,q14,#8 - add r3,r3,r4 - vadd.i32 q2,q2,q3 - mov r14,r14,ror#16 - vadd.i32 q6,q6,q7 - eor r10,r10,r2,ror#16 - vadd.i32 q10,q10,q11 - eor r14,r14,r3,ror#16 - veor q12,q1,q2 - add r8,r8,r10 - veor q13,q5,q6 - mov r7,r7,ror#20 - veor q14,q9,q10 - add r9,r9,r14 - vshr.u32 q1,q12,#25 - mov r4,r4,ror#20 - vshr.u32 q5,q13,#25 - eor r7,r7,r8,ror#20 - vshr.u32 q9,q14,#25 - eor r4,r4,r9,ror#20 - vsli.32 q1,q12,#7 - add r2,r2,r7 - vsli.32 q5,q13,#7 - mov r10,r10,ror#24 - vsli.32 q9,q14,#7 - add r3,r3,r4 - vext.8 q2,q2,q2,#8 - mov r14,r14,ror#24 - vext.8 q6,q6,q6,#8 - eor r10,r10,r2,ror#24 - vext.8 q10,q10,q10,#8 - eor r14,r14,r3,ror#24 - vext.8 q1,q1,q1,#12 - add r8,r8,r10 - vext.8 q5,q5,q5,#12 - mov r7,r7,ror#25 - vext.8 q9,q9,q9,#12 - add r9,r9,r14 - vext.8 q3,q3,q3,#4 - mov r4,r4,ror#25 - vext.8 q7,q7,q7,#4 - eor r7,r7,r8,ror#25 - vext.8 q11,q11,q11,#4 - eor r4,r4,r9,ror#25 - bne Loop_neon - - add r11,sp,#32 - vld1.32 {q12,q13},[sp] @ load key material - vld1.32 {q14,q15},[r11] - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - ldr r12,[sp,#4*(32+1)] @ load inp - ldr r14,[sp,#4*(32+0)] @ load out - - vadd.i32 q0,q0,q12 @ accumulate key material - vadd.i32 q4,q4,q12 - vadd.i32 q8,q8,q12 - vldr d24,[sp,#4*(16+0)] @ one - - vadd.i32 q1,q1,q13 - vadd.i32 q5,q5,q13 - vadd.i32 q9,q9,q13 - vldr d26,[sp,#4*(16+2)] @ two - - vadd.i32 q2,q2,q14 - vadd.i32 q6,q6,q14 - vadd.i32 q10,q10,q14 - vadd.i32 d14,d14,d24 @ counter+1 - vadd.i32 d22,d22,d26 @ counter+2 - - vadd.i32 q3,q3,q15 - vadd.i32 q7,q7,q15 - vadd.i32 q11,q11,q15 - - cmp r11,#64*4 - blo Ltail_neon - - vld1.8 {q12,q13},[r12]! @ load input - mov r11,sp - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 @ xor with input - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - vst1.8 {q0,q1},[r14]! @ store output - veor q5,q5,q13 - vld1.8 {q12,q13},[r12]! - veor q6,q6,q14 - vst1.8 {q2,q3},[r14]! - veor q7,q7,q15 - vld1.8 {q14,q15},[r12]! - - veor q8,q8,q12 - vld1.32 {q0,q1},[r11]! @ load for next iteration - veor d25,d25,d25 - vldr d24,[sp,#4*(16+4)] @ four - veor q9,q9,q13 - vld1.32 {q2,q3},[r11] - veor q10,q10,q14 - vst1.8 {q4,q5},[r14]! - veor q11,q11,q15 - vst1.8 {q6,q7},[r14]! 
- - vadd.i32 d6,d6,d24 @ next counter value - vldr d24,[sp,#4*(16+0)] @ one - - ldmia sp,{r8,r9,r10,r11} @ load key material - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - vst1.8 {q8,q9},[r14]! - add r1,r1,r9 - ldr r9,[r12,#-12] - vst1.8 {q10,q11},[r14]! - add r2,r2,r10 - ldr r10,[r12,#-8] - add r3,r3,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - eor r0,r0,r8 @ xor with input - add r8,sp,#4*(4) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r5,r5,r9 - ldr r9,[r12,#-12] - add r6,r6,r10 - ldr r10,[r12,#-8] - add r7,r7,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - eor r4,r4,r8 - add r8,sp,#4*(8) - eor r5,r5,r9 - str r4,[r14],#16 @ store output - eor r6,r6,r10 - str r5,[r14,#-12] - eor r7,r7,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r1,r1,r9 - ldr r9,[r12,#-12] -# ifdef __thumb2__ - it hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - add r2,r2,r10 - ldr r10,[r12,#-8] -# ifdef __thumb2__ - it hi -# endif - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r3,r3,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - eor r0,r0,r8 - add r8,sp,#4*(12) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r8,r8,#4 @ next counter value - add r5,r5,r9 - str r8,[sp,#4*(12)] @ save next counter value - ldr r8,[r12],#16 @ load input - add r6,r6,r10 - add r4,r4,#3 @ counter+3 - ldr r9,[r12,#-12] - add r7,r7,r11 - ldr r10,[r12,#-8] - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - eor r4,r4,r8 -# ifdef __thumb2__ - it hi -# endif - ldrhi r8,[sp,#4*(32+2)] @ re-load len - eor r5,r5,r9 - eor r6,r6,r10 - str r4,[r14],#16 @ store output - eor r7,r7,r11 - str r5,[r14,#-12] - sub r11,r8,#64*4 @ len-=64*4 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi Loop_neon_outer - - b Ldone_neon - -.align 4 -Lbreak_neon: - @ harmonize NEON and integer-only stack frames: load data - @ from NEON frame, but save to integer-only one; distance - @ between the two is 4*(32+4+16-32)=4*(20). - - str r11, [sp,#4*(20+32+2)] @ save len - add r11,sp,#4*(32+4) - str r12, [sp,#4*(20+32+1)] @ save inp - str r14, [sp,#4*(20+32+0)] @ save out - - ldr r12,[sp,#4*(16+10)] - ldr r14,[sp,#4*(16+11)] - vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement - str r12,[sp,#4*(20+16+10)] @ copy "rx" - str r14,[sp,#4*(20+16+11)] @ copy "rx" - - ldr r11, [sp,#4*(15)] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - ldr r10, [sp,#4*(13)] - ldr r14,[sp,#4*(14)] - str r11, [sp,#4*(20+16+15)] - add r11,sp,#4*(20) - vst1.32 {q0,q1},[r11]! 
@ copy key - add sp,sp,#4*(20) @ switch frame - vst1.32 {q2,q3},[r11] - mov r11,#10 - b Loop @ go integer-only - -.align 4 -Ltail_neon: - cmp r11,#64*3 - bhs L192_or_more_neon - cmp r11,#64*2 - bhs L128_or_more_neon - cmp r11,#64*1 - bhs L64_or_more_neon - - add r8,sp,#4*(8) - vst1.8 {q0,q1},[sp] - add r10,sp,#4*(0) - vst1.8 {q2,q3},[r8] - b Loop_tail_neon - -.align 4 -L64_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - veor q2,q2,q14 - veor q3,q3,q15 - vst1.8 {q0,q1},[r14]! - vst1.8 {q2,q3},[r14]! - - beq Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q4,q5},[sp] - add r10,sp,#4*(0) - vst1.8 {q6,q7},[r8] - sub r11,r11,#64*1 @ len-=64*1 - b Loop_tail_neon - -.align 4 -L128_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vst1.8 {q0,q1},[r14]! - veor q6,q6,q14 - vst1.8 {q2,q3},[r14]! - veor q7,q7,q15 - vst1.8 {q4,q5},[r14]! - vst1.8 {q6,q7},[r14]! - - beq Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q8,q9},[sp] - add r10,sp,#4*(0) - vst1.8 {q10,q11},[r8] - sub r11,r11,#64*2 @ len-=64*2 - b Loop_tail_neon - -.align 4 -L192_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vld1.8 {q12,q13},[r12]! - veor q6,q6,q14 - vst1.8 {q0,q1},[r14]! - veor q7,q7,q15 - vld1.8 {q14,q15},[r12]! - - veor q8,q8,q12 - vst1.8 {q2,q3},[r14]! - veor q9,q9,q13 - vst1.8 {q4,q5},[r14]! - veor q10,q10,q14 - vst1.8 {q6,q7},[r14]! - veor q11,q11,q15 - vst1.8 {q8,q9},[r14]! - vst1.8 {q10,q11},[r14]! 
- - beq Ldone_neon - - ldmia sp,{r8,r9,r10,r11} @ load key material - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(4) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - - add r4,r4,r8 @ accumulate key material - add r8,sp,#4*(8) - add r5,r5,r9 - add r6,r6,r10 - add r7,r7,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} - add r0,sp,#4*(16+8) - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(12) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - - add r4,r4,r8 @ accumulate key material - add r8,sp,#4*(8) - add r5,r5,r9 - add r4,r4,#3 @ counter+3 - add r6,r6,r10 - add r7,r7,r11 - ldr r11,[sp,#4*(32+2)] @ re-load len -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} - add r10,sp,#4*(0) - sub r11,r11,#64*3 @ len-=64*3 - -Loop_tail_neon: - ldrb r8,[r10],#1 @ read buffer on stack - ldrb r9,[r12],#1 @ read input - subs r11,r11,#1 - eor r8,r8,r9 - strb r8,[r14],#1 @ store output - bne Loop_tail_neon - -Ldone_neon: - add sp,sp,#4*(32+4) - vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} - add sp,sp,#4*(16+3) - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} - -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-windows.windows.x86.S deleted file mode 100644 index ef88da19..00000000 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-windows.windows.x86.S +++ /dev/null @@ -1,973 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _ChaCha20_ctr32_nohw -align 16 -_ChaCha20_ctr32_nohw: -L$_ChaCha20_ctr32_nohw_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [32+esp] - mov edi,DWORD [36+esp] - sub esp,132 - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edx,DWORD [12+esi] - mov DWORD [80+esp],eax - mov DWORD [84+esp],ebx - mov DWORD [88+esp],ecx - mov DWORD [92+esp],edx - mov eax,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov edx,DWORD [28+esi] - mov DWORD [96+esp],eax - mov DWORD [100+esp],ebx - mov DWORD [104+esp],ecx - mov DWORD [108+esp],edx - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - sub eax,1 - mov DWORD [112+esp],eax - mov DWORD [116+esp],ebx - mov DWORD [120+esp],ecx - mov DWORD [124+esp],edx - jmp NEAR L$000entry -align 16 -L$001outer_loop: - mov DWORD [156+esp],ebx - mov DWORD [152+esp],eax - mov DWORD [160+esp],ecx -L$000entry: - mov eax,1634760805 - mov DWORD [4+esp],857760878 - mov DWORD [8+esp],2036477234 - mov DWORD [12+esp],1797285236 - mov ebx,DWORD [84+esp] - mov ebp,DWORD [88+esp] - mov ecx,DWORD [104+esp] - mov esi,DWORD [108+esp] - mov edx,DWORD [116+esp] - mov edi,DWORD [120+esp] - mov DWORD [20+esp],ebx - mov DWORD [24+esp],ebp - mov DWORD [40+esp],ecx - mov DWORD [44+esp],esi - mov DWORD [52+esp],edx - mov DWORD [56+esp],edi - mov ebx,DWORD [92+esp] - mov edi,DWORD [124+esp] - mov edx,DWORD [112+esp] - mov ebp,DWORD [80+esp] - mov ecx,DWORD [96+esp] - mov esi,DWORD [100+esp] - add edx,1 - mov DWORD [28+esp],ebx - mov DWORD [60+esp],edi - mov DWORD [112+esp],edx - mov ebx,10 - jmp NEAR L$002loop -align 16 -L$002loop: - add eax,ebp - mov DWORD [128+esp],ebx - mov ebx,ebp - xor edx,eax - rol edx,16 - add ecx,edx - xor ebx,ecx - mov edi,DWORD [52+esp] - rol ebx,12 - mov ebp,DWORD [20+esp] - add eax,ebx - xor edx,eax - mov DWORD [esp],eax - rol edx,8 - mov eax,DWORD [4+esp] - add ecx,edx - mov DWORD [48+esp],edx - xor ebx,ecx - add eax,ebp - rol ebx,7 - xor edi,eax - mov DWORD [32+esp],ecx - rol edi,16 - mov DWORD [16+esp],ebx - add esi,edi - mov ecx,DWORD [40+esp] - xor ebp,esi - mov edx,DWORD [56+esp] - rol ebp,12 - mov ebx,DWORD [24+esp] - add eax,ebp - xor edi,eax - mov DWORD [4+esp],eax - rol edi,8 - mov eax,DWORD [8+esp] - add esi,edi - mov DWORD [52+esp],edi - xor ebp,esi - add eax,ebx - rol ebp,7 - xor edx,eax - mov DWORD [36+esp],esi - rol edx,16 - mov DWORD [20+esp],ebp - add ecx,edx - mov esi,DWORD [44+esp] - xor ebx,ecx - mov edi,DWORD [60+esp] - rol ebx,12 - mov ebp,DWORD [28+esp] - add eax,ebx - xor edx,eax - mov DWORD [8+esp],eax - rol edx,8 - mov eax,DWORD [12+esp] - add ecx,edx - mov DWORD [56+esp],edx - xor ebx,ecx - add eax,ebp - rol ebx,7 - xor edi,eax - rol edi,16 - mov DWORD [24+esp],ebx - add esi,edi - xor ebp,esi - rol ebp,12 - mov ebx,DWORD [20+esp] - add eax,ebp - xor edi,eax - mov DWORD [12+esp],eax - rol edi,8 - mov eax,DWORD [esp] - add esi,edi - mov edx,edi - xor ebp,esi - add eax,ebx - rol ebp,7 - xor edx,eax - rol edx,16 - mov DWORD [28+esp],ebp - add ecx,edx - xor ebx,ecx - mov edi,DWORD [48+esp] - rol ebx,12 - mov ebp,DWORD [24+esp] - add eax,ebx - xor edx,eax - mov DWORD [esp],eax - rol edx,8 - mov eax,DWORD [4+esp] - add ecx,edx - mov DWORD 
[60+esp],edx - xor ebx,ecx - add eax,ebp - rol ebx,7 - xor edi,eax - mov DWORD [40+esp],ecx - rol edi,16 - mov DWORD [20+esp],ebx - add esi,edi - mov ecx,DWORD [32+esp] - xor ebp,esi - mov edx,DWORD [52+esp] - rol ebp,12 - mov ebx,DWORD [28+esp] - add eax,ebp - xor edi,eax - mov DWORD [4+esp],eax - rol edi,8 - mov eax,DWORD [8+esp] - add esi,edi - mov DWORD [48+esp],edi - xor ebp,esi - add eax,ebx - rol ebp,7 - xor edx,eax - mov DWORD [44+esp],esi - rol edx,16 - mov DWORD [24+esp],ebp - add ecx,edx - mov esi,DWORD [36+esp] - xor ebx,ecx - mov edi,DWORD [56+esp] - rol ebx,12 - mov ebp,DWORD [16+esp] - add eax,ebx - xor edx,eax - mov DWORD [8+esp],eax - rol edx,8 - mov eax,DWORD [12+esp] - add ecx,edx - mov DWORD [52+esp],edx - xor ebx,ecx - add eax,ebp - rol ebx,7 - xor edi,eax - rol edi,16 - mov DWORD [28+esp],ebx - add esi,edi - xor ebp,esi - mov edx,DWORD [48+esp] - rol ebp,12 - mov ebx,DWORD [128+esp] - add eax,ebp - xor edi,eax - mov DWORD [12+esp],eax - rol edi,8 - mov eax,DWORD [esp] - add esi,edi - mov DWORD [56+esp],edi - xor ebp,esi - rol ebp,7 - dec ebx - jnz NEAR L$002loop - mov ebx,DWORD [160+esp] - add eax,1634760805 - add ebp,DWORD [80+esp] - add ecx,DWORD [96+esp] - add esi,DWORD [100+esp] - cmp ebx,64 - jb NEAR L$003tail - mov ebx,DWORD [156+esp] - add edx,DWORD [112+esp] - add edi,DWORD [120+esp] - xor eax,DWORD [ebx] - xor ebp,DWORD [16+ebx] - mov DWORD [esp],eax - mov eax,DWORD [152+esp] - xor ecx,DWORD [32+ebx] - xor esi,DWORD [36+ebx] - xor edx,DWORD [48+ebx] - xor edi,DWORD [56+ebx] - mov DWORD [16+eax],ebp - mov DWORD [32+eax],ecx - mov DWORD [36+eax],esi - mov DWORD [48+eax],edx - mov DWORD [56+eax],edi - mov ebp,DWORD [4+esp] - mov ecx,DWORD [8+esp] - mov esi,DWORD [12+esp] - mov edx,DWORD [20+esp] - mov edi,DWORD [24+esp] - add ebp,857760878 - add ecx,2036477234 - add esi,1797285236 - add edx,DWORD [84+esp] - add edi,DWORD [88+esp] - xor ebp,DWORD [4+ebx] - xor ecx,DWORD [8+ebx] - xor esi,DWORD [12+ebx] - xor edx,DWORD [20+ebx] - xor edi,DWORD [24+ebx] - mov DWORD [4+eax],ebp - mov DWORD [8+eax],ecx - mov DWORD [12+eax],esi - mov DWORD [20+eax],edx - mov DWORD [24+eax],edi - mov ebp,DWORD [28+esp] - mov ecx,DWORD [40+esp] - mov esi,DWORD [44+esp] - mov edx,DWORD [52+esp] - mov edi,DWORD [60+esp] - add ebp,DWORD [92+esp] - add ecx,DWORD [104+esp] - add esi,DWORD [108+esp] - add edx,DWORD [116+esp] - add edi,DWORD [124+esp] - xor ebp,DWORD [28+ebx] - xor ecx,DWORD [40+ebx] - xor esi,DWORD [44+ebx] - xor edx,DWORD [52+ebx] - xor edi,DWORD [60+ebx] - lea ebx,[64+ebx] - mov DWORD [28+eax],ebp - mov ebp,DWORD [esp] - mov DWORD [40+eax],ecx - mov ecx,DWORD [160+esp] - mov DWORD [44+eax],esi - mov DWORD [52+eax],edx - mov DWORD [60+eax],edi - mov DWORD [eax],ebp - lea eax,[64+eax] - sub ecx,64 - jnz NEAR L$001outer_loop - jmp NEAR L$004done -L$003tail: - add edx,DWORD [112+esp] - add edi,DWORD [120+esp] - mov DWORD [esp],eax - mov DWORD [16+esp],ebp - mov DWORD [32+esp],ecx - mov DWORD [36+esp],esi - mov DWORD [48+esp],edx - mov DWORD [56+esp],edi - mov ebp,DWORD [4+esp] - mov ecx,DWORD [8+esp] - mov esi,DWORD [12+esp] - mov edx,DWORD [20+esp] - mov edi,DWORD [24+esp] - add ebp,857760878 - add ecx,2036477234 - add esi,1797285236 - add edx,DWORD [84+esp] - add edi,DWORD [88+esp] - mov DWORD [4+esp],ebp - mov DWORD [8+esp],ecx - mov DWORD [12+esp],esi - mov DWORD [20+esp],edx - mov DWORD [24+esp],edi - mov ebp,DWORD [28+esp] - mov ecx,DWORD [40+esp] - mov esi,DWORD [44+esp] - mov edx,DWORD [52+esp] - mov edi,DWORD [60+esp] - add ebp,DWORD [92+esp] - add ecx,DWORD [104+esp] 
- add esi,DWORD [108+esp] - add edx,DWORD [116+esp] - add edi,DWORD [124+esp] - mov DWORD [28+esp],ebp - mov ebp,DWORD [156+esp] - mov DWORD [40+esp],ecx - mov ecx,DWORD [152+esp] - mov DWORD [44+esp],esi - xor esi,esi - mov DWORD [52+esp],edx - mov DWORD [60+esp],edi - xor eax,eax - xor edx,edx -L$005tail_loop: - mov al,BYTE [ebp*1+esi] - mov dl,BYTE [esi*1+esp] - lea esi,[1+esi] - xor al,dl - mov BYTE [esi*1+ecx-1],al - dec ebx - jnz NEAR L$005tail_loop -L$004done: - add esp,132 - pop edi - pop esi - pop ebx - pop ebp - ret -global _ChaCha20_ctr32_ssse3 -align 16 -_ChaCha20_ctr32_ssse3: -L$_ChaCha20_ctr32_ssse3_begin: - push ebp - push ebx - push esi - push edi - call L$pic_point -L$pic_point: - pop eax - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov ecx,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - mov ebp,esp - sub esp,524 - and esp,-64 - mov DWORD [512+esp],ebp - lea eax,[(L$ssse3_data-L$pic_point)+eax] - movdqu xmm3,[ebx] - cmp ecx,256 - jb NEAR L$0061x - mov DWORD [516+esp],edx - mov DWORD [520+esp],ebx - sub ecx,256 - lea ebp,[384+esp] - movdqu xmm7,[edx] - pshufd xmm0,xmm3,0 - pshufd xmm1,xmm3,85 - pshufd xmm2,xmm3,170 - pshufd xmm3,xmm3,255 - paddd xmm0,[48+eax] - pshufd xmm4,xmm7,0 - pshufd xmm5,xmm7,85 - psubd xmm0,[64+eax] - pshufd xmm6,xmm7,170 - pshufd xmm7,xmm7,255 - movdqa [64+ebp],xmm0 - movdqa [80+ebp],xmm1 - movdqa [96+ebp],xmm2 - movdqa [112+ebp],xmm3 - movdqu xmm3,[16+edx] - movdqa [ebp-64],xmm4 - movdqa [ebp-48],xmm5 - movdqa [ebp-32],xmm6 - movdqa [ebp-16],xmm7 - movdqa xmm7,[32+eax] - lea ebx,[128+esp] - pshufd xmm0,xmm3,0 - pshufd xmm1,xmm3,85 - pshufd xmm2,xmm3,170 - pshufd xmm3,xmm3,255 - pshufd xmm4,xmm7,0 - pshufd xmm5,xmm7,85 - pshufd xmm6,xmm7,170 - pshufd xmm7,xmm7,255 - movdqa [ebp],xmm0 - movdqa [16+ebp],xmm1 - movdqa [32+ebp],xmm2 - movdqa [48+ebp],xmm3 - movdqa [ebp-128],xmm4 - movdqa [ebp-112],xmm5 - movdqa [ebp-96],xmm6 - movdqa [ebp-80],xmm7 - lea esi,[128+esi] - lea edi,[128+edi] - jmp NEAR L$007outer_loop -align 16 -L$007outer_loop: - movdqa xmm1,[ebp-112] - movdqa xmm2,[ebp-96] - movdqa xmm3,[ebp-80] - movdqa xmm5,[ebp-48] - movdqa xmm6,[ebp-32] - movdqa xmm7,[ebp-16] - movdqa [ebx-112],xmm1 - movdqa [ebx-96],xmm2 - movdqa [ebx-80],xmm3 - movdqa [ebx-48],xmm5 - movdqa [ebx-32],xmm6 - movdqa [ebx-16],xmm7 - movdqa xmm2,[32+ebp] - movdqa xmm3,[48+ebp] - movdqa xmm4,[64+ebp] - movdqa xmm5,[80+ebp] - movdqa xmm6,[96+ebp] - movdqa xmm7,[112+ebp] - paddd xmm4,[64+eax] - movdqa [32+ebx],xmm2 - movdqa [48+ebx],xmm3 - movdqa [64+ebx],xmm4 - movdqa [80+ebx],xmm5 - movdqa [96+ebx],xmm6 - movdqa [112+ebx],xmm7 - movdqa [64+ebp],xmm4 - movdqa xmm0,[ebp-128] - movdqa xmm6,xmm4 - movdqa xmm3,[ebp-64] - movdqa xmm4,[ebp] - movdqa xmm5,[16+ebp] - mov edx,10 - nop -align 16 -L$008loop: - paddd xmm0,xmm3 - movdqa xmm2,xmm3 - pxor xmm6,xmm0 - pshufb xmm6,[eax] - paddd xmm4,xmm6 - pxor xmm2,xmm4 - movdqa xmm3,[ebx-48] - movdqa xmm1,xmm2 - pslld xmm2,12 - psrld xmm1,20 - por xmm2,xmm1 - movdqa xmm1,[ebx-112] - paddd xmm0,xmm2 - movdqa xmm7,[80+ebx] - pxor xmm6,xmm0 - movdqa [ebx-128],xmm0 - pshufb xmm6,[16+eax] - paddd xmm4,xmm6 - movdqa [64+ebx],xmm6 - pxor xmm2,xmm4 - paddd xmm1,xmm3 - movdqa xmm0,xmm2 - pslld xmm2,7 - psrld xmm0,25 - pxor xmm7,xmm1 - por xmm2,xmm0 - movdqa [ebx],xmm4 - pshufb xmm7,[eax] - movdqa [ebx-64],xmm2 - paddd xmm5,xmm7 - movdqa xmm4,[32+ebx] - pxor xmm3,xmm5 - movdqa xmm2,[ebx-32] - movdqa xmm0,xmm3 - pslld xmm3,12 - psrld xmm0,20 - por xmm3,xmm0 - movdqa xmm0,[ebx-96] - paddd xmm1,xmm3 - movdqa xmm6,[96+ebx] - 
pxor xmm7,xmm1 - movdqa [ebx-112],xmm1 - pshufb xmm7,[16+eax] - paddd xmm5,xmm7 - movdqa [80+ebx],xmm7 - pxor xmm3,xmm5 - paddd xmm0,xmm2 - movdqa xmm1,xmm3 - pslld xmm3,7 - psrld xmm1,25 - pxor xmm6,xmm0 - por xmm3,xmm1 - movdqa [16+ebx],xmm5 - pshufb xmm6,[eax] - movdqa [ebx-48],xmm3 - paddd xmm4,xmm6 - movdqa xmm5,[48+ebx] - pxor xmm2,xmm4 - movdqa xmm3,[ebx-16] - movdqa xmm1,xmm2 - pslld xmm2,12 - psrld xmm1,20 - por xmm2,xmm1 - movdqa xmm1,[ebx-80] - paddd xmm0,xmm2 - movdqa xmm7,[112+ebx] - pxor xmm6,xmm0 - movdqa [ebx-96],xmm0 - pshufb xmm6,[16+eax] - paddd xmm4,xmm6 - movdqa [96+ebx],xmm6 - pxor xmm2,xmm4 - paddd xmm1,xmm3 - movdqa xmm0,xmm2 - pslld xmm2,7 - psrld xmm0,25 - pxor xmm7,xmm1 - por xmm2,xmm0 - pshufb xmm7,[eax] - movdqa [ebx-32],xmm2 - paddd xmm5,xmm7 - pxor xmm3,xmm5 - movdqa xmm2,[ebx-48] - movdqa xmm0,xmm3 - pslld xmm3,12 - psrld xmm0,20 - por xmm3,xmm0 - movdqa xmm0,[ebx-128] - paddd xmm1,xmm3 - pxor xmm7,xmm1 - movdqa [ebx-80],xmm1 - pshufb xmm7,[16+eax] - paddd xmm5,xmm7 - movdqa xmm6,xmm7 - pxor xmm3,xmm5 - paddd xmm0,xmm2 - movdqa xmm1,xmm3 - pslld xmm3,7 - psrld xmm1,25 - pxor xmm6,xmm0 - por xmm3,xmm1 - pshufb xmm6,[eax] - movdqa [ebx-16],xmm3 - paddd xmm4,xmm6 - pxor xmm2,xmm4 - movdqa xmm3,[ebx-32] - movdqa xmm1,xmm2 - pslld xmm2,12 - psrld xmm1,20 - por xmm2,xmm1 - movdqa xmm1,[ebx-112] - paddd xmm0,xmm2 - movdqa xmm7,[64+ebx] - pxor xmm6,xmm0 - movdqa [ebx-128],xmm0 - pshufb xmm6,[16+eax] - paddd xmm4,xmm6 - movdqa [112+ebx],xmm6 - pxor xmm2,xmm4 - paddd xmm1,xmm3 - movdqa xmm0,xmm2 - pslld xmm2,7 - psrld xmm0,25 - pxor xmm7,xmm1 - por xmm2,xmm0 - movdqa [32+ebx],xmm4 - pshufb xmm7,[eax] - movdqa [ebx-48],xmm2 - paddd xmm5,xmm7 - movdqa xmm4,[ebx] - pxor xmm3,xmm5 - movdqa xmm2,[ebx-16] - movdqa xmm0,xmm3 - pslld xmm3,12 - psrld xmm0,20 - por xmm3,xmm0 - movdqa xmm0,[ebx-96] - paddd xmm1,xmm3 - movdqa xmm6,[80+ebx] - pxor xmm7,xmm1 - movdqa [ebx-112],xmm1 - pshufb xmm7,[16+eax] - paddd xmm5,xmm7 - movdqa [64+ebx],xmm7 - pxor xmm3,xmm5 - paddd xmm0,xmm2 - movdqa xmm1,xmm3 - pslld xmm3,7 - psrld xmm1,25 - pxor xmm6,xmm0 - por xmm3,xmm1 - movdqa [48+ebx],xmm5 - pshufb xmm6,[eax] - movdqa [ebx-32],xmm3 - paddd xmm4,xmm6 - movdqa xmm5,[16+ebx] - pxor xmm2,xmm4 - movdqa xmm3,[ebx-64] - movdqa xmm1,xmm2 - pslld xmm2,12 - psrld xmm1,20 - por xmm2,xmm1 - movdqa xmm1,[ebx-80] - paddd xmm0,xmm2 - movdqa xmm7,[96+ebx] - pxor xmm6,xmm0 - movdqa [ebx-96],xmm0 - pshufb xmm6,[16+eax] - paddd xmm4,xmm6 - movdqa [80+ebx],xmm6 - pxor xmm2,xmm4 - paddd xmm1,xmm3 - movdqa xmm0,xmm2 - pslld xmm2,7 - psrld xmm0,25 - pxor xmm7,xmm1 - por xmm2,xmm0 - pshufb xmm7,[eax] - movdqa [ebx-16],xmm2 - paddd xmm5,xmm7 - pxor xmm3,xmm5 - movdqa xmm0,xmm3 - pslld xmm3,12 - psrld xmm0,20 - por xmm3,xmm0 - movdqa xmm0,[ebx-128] - paddd xmm1,xmm3 - movdqa xmm6,[64+ebx] - pxor xmm7,xmm1 - movdqa [ebx-80],xmm1 - pshufb xmm7,[16+eax] - paddd xmm5,xmm7 - movdqa [96+ebx],xmm7 - pxor xmm3,xmm5 - movdqa xmm1,xmm3 - pslld xmm3,7 - psrld xmm1,25 - por xmm3,xmm1 - dec edx - jnz NEAR L$008loop - movdqa [ebx-64],xmm3 - movdqa [ebx],xmm4 - movdqa [16+ebx],xmm5 - movdqa [64+ebx],xmm6 - movdqa [96+ebx],xmm7 - movdqa xmm1,[ebx-112] - movdqa xmm2,[ebx-96] - movdqa xmm3,[ebx-80] - paddd xmm0,[ebp-128] - paddd xmm1,[ebp-112] - paddd xmm2,[ebp-96] - paddd xmm3,[ebp-80] - movdqa xmm6,xmm0 - punpckldq xmm0,xmm1 - movdqa xmm7,xmm2 - punpckldq xmm2,xmm3 - punpckhdq xmm6,xmm1 - punpckhdq xmm7,xmm3 - movdqa xmm1,xmm0 - punpcklqdq xmm0,xmm2 - movdqa xmm3,xmm6 - punpcklqdq xmm6,xmm7 - punpckhqdq xmm1,xmm2 - punpckhqdq 
xmm3,xmm7 - movdqu xmm4,[esi-128] - movdqu xmm5,[esi-64] - movdqu xmm2,[esi] - movdqu xmm7,[64+esi] - lea esi,[16+esi] - pxor xmm4,xmm0 - movdqa xmm0,[ebx-64] - pxor xmm5,xmm1 - movdqa xmm1,[ebx-48] - pxor xmm6,xmm2 - movdqa xmm2,[ebx-32] - pxor xmm7,xmm3 - movdqa xmm3,[ebx-16] - movdqu [edi-128],xmm4 - movdqu [edi-64],xmm5 - movdqu [edi],xmm6 - movdqu [64+edi],xmm7 - lea edi,[16+edi] - paddd xmm0,[ebp-64] - paddd xmm1,[ebp-48] - paddd xmm2,[ebp-32] - paddd xmm3,[ebp-16] - movdqa xmm6,xmm0 - punpckldq xmm0,xmm1 - movdqa xmm7,xmm2 - punpckldq xmm2,xmm3 - punpckhdq xmm6,xmm1 - punpckhdq xmm7,xmm3 - movdqa xmm1,xmm0 - punpcklqdq xmm0,xmm2 - movdqa xmm3,xmm6 - punpcklqdq xmm6,xmm7 - punpckhqdq xmm1,xmm2 - punpckhqdq xmm3,xmm7 - movdqu xmm4,[esi-128] - movdqu xmm5,[esi-64] - movdqu xmm2,[esi] - movdqu xmm7,[64+esi] - lea esi,[16+esi] - pxor xmm4,xmm0 - movdqa xmm0,[ebx] - pxor xmm5,xmm1 - movdqa xmm1,[16+ebx] - pxor xmm6,xmm2 - movdqa xmm2,[32+ebx] - pxor xmm7,xmm3 - movdqa xmm3,[48+ebx] - movdqu [edi-128],xmm4 - movdqu [edi-64],xmm5 - movdqu [edi],xmm6 - movdqu [64+edi],xmm7 - lea edi,[16+edi] - paddd xmm0,[ebp] - paddd xmm1,[16+ebp] - paddd xmm2,[32+ebp] - paddd xmm3,[48+ebp] - movdqa xmm6,xmm0 - punpckldq xmm0,xmm1 - movdqa xmm7,xmm2 - punpckldq xmm2,xmm3 - punpckhdq xmm6,xmm1 - punpckhdq xmm7,xmm3 - movdqa xmm1,xmm0 - punpcklqdq xmm0,xmm2 - movdqa xmm3,xmm6 - punpcklqdq xmm6,xmm7 - punpckhqdq xmm1,xmm2 - punpckhqdq xmm3,xmm7 - movdqu xmm4,[esi-128] - movdqu xmm5,[esi-64] - movdqu xmm2,[esi] - movdqu xmm7,[64+esi] - lea esi,[16+esi] - pxor xmm4,xmm0 - movdqa xmm0,[64+ebx] - pxor xmm5,xmm1 - movdqa xmm1,[80+ebx] - pxor xmm6,xmm2 - movdqa xmm2,[96+ebx] - pxor xmm7,xmm3 - movdqa xmm3,[112+ebx] - movdqu [edi-128],xmm4 - movdqu [edi-64],xmm5 - movdqu [edi],xmm6 - movdqu [64+edi],xmm7 - lea edi,[16+edi] - paddd xmm0,[64+ebp] - paddd xmm1,[80+ebp] - paddd xmm2,[96+ebp] - paddd xmm3,[112+ebp] - movdqa xmm6,xmm0 - punpckldq xmm0,xmm1 - movdqa xmm7,xmm2 - punpckldq xmm2,xmm3 - punpckhdq xmm6,xmm1 - punpckhdq xmm7,xmm3 - movdqa xmm1,xmm0 - punpcklqdq xmm0,xmm2 - movdqa xmm3,xmm6 - punpcklqdq xmm6,xmm7 - punpckhqdq xmm1,xmm2 - punpckhqdq xmm3,xmm7 - movdqu xmm4,[esi-128] - movdqu xmm5,[esi-64] - movdqu xmm2,[esi] - movdqu xmm7,[64+esi] - lea esi,[208+esi] - pxor xmm4,xmm0 - pxor xmm5,xmm1 - pxor xmm6,xmm2 - pxor xmm7,xmm3 - movdqu [edi-128],xmm4 - movdqu [edi-64],xmm5 - movdqu [edi],xmm6 - movdqu [64+edi],xmm7 - lea edi,[208+edi] - sub ecx,256 - jnc NEAR L$007outer_loop - add ecx,256 - jz NEAR L$009done - mov ebx,DWORD [520+esp] - lea esi,[esi-128] - mov edx,DWORD [516+esp] - lea edi,[edi-128] - movd xmm2,DWORD [64+ebp] - movdqu xmm3,[ebx] - paddd xmm2,[96+eax] - pand xmm3,[112+eax] - por xmm3,xmm2 -L$0061x: - movdqa xmm0,[32+eax] - movdqu xmm1,[edx] - movdqu xmm2,[16+edx] - movdqa xmm6,[eax] - movdqa xmm7,[16+eax] - mov DWORD [48+esp],ebp - movdqa [esp],xmm0 - movdqa [16+esp],xmm1 - movdqa [32+esp],xmm2 - movdqa [48+esp],xmm3 - mov edx,10 - jmp NEAR L$010loop1x -align 16 -L$011outer1x: - movdqa xmm3,[80+eax] - movdqa xmm0,[esp] - movdqa xmm1,[16+esp] - movdqa xmm2,[32+esp] - paddd xmm3,[48+esp] - mov edx,10 - movdqa [48+esp],xmm3 - jmp NEAR L$010loop1x -align 16 -L$010loop1x: - paddd xmm0,xmm1 - pxor xmm3,xmm0 -db 102,15,56,0,222 - paddd xmm2,xmm3 - pxor xmm1,xmm2 - movdqa xmm4,xmm1 - psrld xmm1,20 - pslld xmm4,12 - por xmm1,xmm4 - paddd xmm0,xmm1 - pxor xmm3,xmm0 -db 102,15,56,0,223 - paddd xmm2,xmm3 - pxor xmm1,xmm2 - movdqa xmm4,xmm1 - psrld xmm1,25 - pslld xmm4,7 - por xmm1,xmm4 - pshufd 
xmm2,xmm2,78 - pshufd xmm1,xmm1,57 - pshufd xmm3,xmm3,147 - nop - paddd xmm0,xmm1 - pxor xmm3,xmm0 -db 102,15,56,0,222 - paddd xmm2,xmm3 - pxor xmm1,xmm2 - movdqa xmm4,xmm1 - psrld xmm1,20 - pslld xmm4,12 - por xmm1,xmm4 - paddd xmm0,xmm1 - pxor xmm3,xmm0 -db 102,15,56,0,223 - paddd xmm2,xmm3 - pxor xmm1,xmm2 - movdqa xmm4,xmm1 - psrld xmm1,25 - pslld xmm4,7 - por xmm1,xmm4 - pshufd xmm2,xmm2,78 - pshufd xmm1,xmm1,147 - pshufd xmm3,xmm3,57 - dec edx - jnz NEAR L$010loop1x - paddd xmm0,[esp] - paddd xmm1,[16+esp] - paddd xmm2,[32+esp] - paddd xmm3,[48+esp] - cmp ecx,64 - jb NEAR L$012tail - movdqu xmm4,[esi] - movdqu xmm5,[16+esi] - pxor xmm0,xmm4 - movdqu xmm4,[32+esi] - pxor xmm1,xmm5 - movdqu xmm5,[48+esi] - pxor xmm2,xmm4 - pxor xmm3,xmm5 - lea esi,[64+esi] - movdqu [edi],xmm0 - movdqu [16+edi],xmm1 - movdqu [32+edi],xmm2 - movdqu [48+edi],xmm3 - lea edi,[64+edi] - sub ecx,64 - jnz NEAR L$011outer1x - jmp NEAR L$009done -L$012tail: - movdqa [esp],xmm0 - movdqa [16+esp],xmm1 - movdqa [32+esp],xmm2 - movdqa [48+esp],xmm3 - xor eax,eax - xor edx,edx - xor ebp,ebp -L$013tail_loop: - mov al,BYTE [ebp*1+esp] - mov dl,BYTE [ebp*1+esi] - lea ebp,[1+ebp] - xor al,dl - mov BYTE [ebp*1+edi-1],al - dec ecx - jnz NEAR L$013tail_loop -L$009done: - mov esp,DWORD [512+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$ssse3_data: -db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 -db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 -dd 1634760805,857760878,2036477234,1797285236 -dd 0,1,2,3 -dd 4,4,4,4 -dd 1,0,0,0 -dd 4,0,0,0 -dd 0,-1,-1,-1 -align 64 -db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 -db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 -db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 -db 114,103,62,0 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_des.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_des.c index 3fabb19a..d5aeaf96 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_des.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_des.c @@ -85,17 +85,14 @@ static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, } static const EVP_CIPHER evp_des_cbc = { - /* nid = */ NID_des_cbc, - /* block_size = */ 8, - /* key_len = */ 8, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(EVP_DES_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_init_key, - /* cipher = */ des_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_cbc, + .block_size = 8, + .key_len = 8, + .iv_len = 8, + .ctx_size = sizeof(EVP_DES_KEY), + .flags = EVP_CIPH_CBC_MODE, + .init = des_init_key, + .cipher = des_cbc_cipher, }; const EVP_CIPHER *EVP_des_cbc(void) { return &evp_des_cbc; } @@ -115,17 +112,14 @@ static int des_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, } static const EVP_CIPHER evp_des_ecb = { - /* nid = */ NID_des_ecb, - /* block_size = */ 8, - /* key_len = */ 8, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(EVP_DES_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_init_key, - /* cipher = */ des_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ecb, + .block_size = 8, + .key_len = 8, + .iv_len = 0, + .ctx_size = sizeof(EVP_DES_KEY), + .flags = EVP_CIPH_ECB_MODE, + .init = des_init_key, + .cipher = 
des_ecb_cipher, }; const EVP_CIPHER *EVP_des_ecb(void) { return &evp_des_ecb; } @@ -155,17 +149,14 @@ static int des_ede3_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, } static const EVP_CIPHER evp_des_ede3_cbc = { - /* nid = */ NID_des_ede3_cbc, - /* block_size = */ 8, - /* key_len = */ 24, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_ede3_init_key, - /* cipher = */ des_ede3_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ede3_cbc, + .block_size = 8, + .key_len = 24, + .iv_len = 8, + .ctx_size = sizeof(DES_EDE_KEY), + .flags = EVP_CIPH_CBC_MODE, + .init = des_ede3_init_key, + .cipher = des_ede3_cbc_cipher, }; const EVP_CIPHER *EVP_des_ede3_cbc(void) { return &evp_des_ede3_cbc; } @@ -181,17 +172,14 @@ static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, } static const EVP_CIPHER evp_des_ede_cbc = { - /* nid = */ NID_des_ede_cbc, - /* block_size = */ 8, - /* key_len = */ 16, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_ede_init_key, - /* cipher = */ des_ede3_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ede_cbc, + .block_size = 8, + .key_len = 16, + .iv_len = 8, + .ctx_size = sizeof(DES_EDE_KEY), + .flags = EVP_CIPH_CBC_MODE, + .init = des_ede_init_key, + .cipher = des_ede3_cbc_cipher, }; const EVP_CIPHER *EVP_des_ede_cbc(void) { return &evp_des_ede_cbc; } @@ -212,33 +200,27 @@ static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, } static const EVP_CIPHER evp_des_ede = { - /* nid = */ NID_des_ede_ecb, - /* block_size = */ 8, - /* key_len = */ 16, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_ede_init_key, - /* cipher = */ des_ede_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ede_ecb, + .block_size = 8, + .key_len = 16, + .iv_len = 0, + .ctx_size = sizeof(DES_EDE_KEY), + .flags = EVP_CIPH_ECB_MODE, + .init = des_ede_init_key, + .cipher = des_ede_ecb_cipher, }; const EVP_CIPHER *EVP_des_ede(void) { return &evp_des_ede; } static const EVP_CIPHER evp_des_ede3 = { - /* nid = */ NID_des_ede3_ecb, - /* block_size = */ 8, - /* key_len = */ 24, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_ede3_init_key, - /* cipher = */ des_ede_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ede3_ecb, + .block_size = 8, + .key_len = 24, + .iv_len = 0, + .ctx_size = sizeof(DES_EDE_KEY), + .flags = EVP_CIPH_ECB_MODE, + .init = des_ede3_init_key, + .cipher = des_ede_ecb_cipher, }; const EVP_CIPHER *EVP_des_ede3(void) { return &evp_des_ede3; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_null.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_null.c index 8245c0ea..15e2eb7b 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_null.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_null.c @@ -78,9 +78,13 @@ static int null_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, } static const EVP_CIPHER n_cipher = { - NID_undef, 1 /* block size */, 0 /* key_len */, 0 /* iv_len */, - 0 /* ctx_size */, 0 /* flags */, NULL /* app_data */, null_init_key, - null_cipher, NULL /* cleanup */, NULL /* ctrl */, + .nid = NID_undef, + .block_size = 1, + .key_len = 0, + .iv_len = 0, + .ctx_size = 0, + .init = 
null_init_key, + .cipher = null_cipher, }; const EVP_CIPHER *EVP_enc_null(void) { return &n_cipher; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc2.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc2.c index eaf15062..695b6de0 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc2.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc2.c @@ -427,37 +427,29 @@ static int rc2_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) { } static const EVP_CIPHER rc2_40_cbc = { - NID_rc2_40_cbc, - 8 /* block size */, - 5 /* 40 bit */, - 8 /* iv len */, - sizeof(EVP_RC2_KEY), - EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, - NULL /* app_data */, - rc2_init_key, - rc2_cbc_cipher, - NULL, - rc2_ctrl, + .nid = NID_rc2_40_cbc, + .block_size = 8, + .key_len = 5 /* 40 bit */, + .iv_len = 8, + .ctx_size = sizeof(EVP_RC2_KEY), + .flags = EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, + .init = rc2_init_key, + .cipher = rc2_cbc_cipher, + .ctrl = rc2_ctrl, }; -const EVP_CIPHER *EVP_rc2_40_cbc(void) { - return &rc2_40_cbc; -} +const EVP_CIPHER *EVP_rc2_40_cbc(void) { return &rc2_40_cbc; } static const EVP_CIPHER rc2_cbc = { - NID_rc2_cbc, - 8 /* block size */, - 16 /* 128 bit */, - 8 /* iv len */, - sizeof(EVP_RC2_KEY), - EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, - NULL /* app_data */, - rc2_init_key, - rc2_cbc_cipher, - NULL, - rc2_ctrl, + .nid = NID_rc2_cbc, + .block_size = 8, + .key_len = 16 /* 128 bit */, + .iv_len = 8, + .ctx_size = sizeof(EVP_RC2_KEY), + .flags = EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, + .init = rc2_init_key, + .cipher = rc2_cbc_cipher, + .ctrl = rc2_ctrl, }; -const EVP_CIPHER *EVP_rc2_cbc(void) { - return &rc2_cbc; -} +const EVP_CIPHER *EVP_rc2_cbc(void) { return &rc2_cbc; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc4.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc4.c index b96269cd..395ffdf4 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc4.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc4.c @@ -81,9 +81,14 @@ static int rc4_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, } static const EVP_CIPHER rc4 = { - NID_rc4, 1 /* block_size */, 16 /* key_size */, - 0 /* iv_len */, sizeof(RC4_KEY), EVP_CIPH_VARIABLE_LENGTH, - NULL /* app_data */, rc4_init_key, rc4_cipher, - NULL /* cleanup */, NULL /* ctrl */, }; + .nid = NID_rc4, + .block_size = 1, + .key_len = 16, + .iv_len = 0, + .ctx_size = sizeof(RC4_KEY), + .flags = EVP_CIPH_VARIABLE_LENGTH, + .init = rc4_init_key, + .cipher = rc4_cipher, +}; const EVP_CIPHER *EVP_rc4(void) { return &rc4; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/internal.h b/Sources/CCryptoBoringSSL/crypto/cipher_extra/internal.h index a6882d4d..ef944faf 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/internal.h @@ -192,22 +192,65 @@ OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { // Additional input parameters are passed in |aead_data->in|. On exit, it will // write calculated tag value to |aead_data->out.tag|, which the caller must // check. 
+#if defined(OPENSSL_X86_64) +extern void chacha20_poly1305_open_nohw( + uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +extern void chacha20_poly1305_open_avx2( + uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +OPENSSL_INLINE void chacha20_poly1305_open(uint8_t *out_plaintext, + const uint8_t *ciphertext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_open_data *data) { + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + chacha20_poly1305_open_avx2(out_plaintext, ciphertext, plaintext_len, ad, + ad_len, data); + } else { + chacha20_poly1305_open_nohw(out_plaintext, ciphertext, plaintext_len, ad, + ad_len, data); + } +} +#else extern void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +#endif // chacha20_poly1305_open is defined in chacha20_poly1305_*.pl. It encrypts // |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|. // Additional input parameters are passed in |aead_data->in|. The calculated tag // value is over the computed ciphertext concatenated with |extra_ciphertext| // and written to |aead_data->out.tag|. +#if defined(OPENSSL_X86_64) +extern void chacha20_poly1305_seal_nohw( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +extern void chacha20_poly1305_seal_avx2( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +OPENSSL_INLINE void chacha20_poly1305_seal( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data) { + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + chacha20_poly1305_seal_avx2(out_ciphertext, plaintext, plaintext_len, ad, + ad_len, data); + } else { + chacha20_poly1305_seal_nohw(out_ciphertext, plaintext, plaintext_len, ad, + ad_len, data); + } +} +#else extern void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +#endif + #else OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { return 0; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/tls_cbc.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/tls_cbc.c index 571b9040..87fd2e00 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/tls_cbc.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/tls_cbc.c @@ -121,8 +121,8 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in, size_t mac_end = in_len; size_t mac_start = mac_end - md_size; - assert(orig_len >= in_len); - assert(in_len >= md_size); + declassify_assert(orig_len >= in_len); + declassify_assert(in_len >= md_size); assert(md_size <= EVP_MAX_MD_SIZE); assert(md_size > 0); diff --git a/Sources/CCryptoBoringSSL/crypto/conf/conf.c b/Sources/CCryptoBoringSSL/crypto/conf/conf.c index 4b1b6fbe..12e50f04 100644 --- a/Sources/CCryptoBoringSSL/crypto/conf/conf.c +++ b/Sources/CCryptoBoringSSL/crypto/conf/conf.c @@ -56,6 +56,7 @@ #include +#include #include #include @@ -65,7 +66,6 @@ #include 
#include -#include "conf_def.h" #include "internal.h" #include "../internal.h" @@ -183,6 +183,26 @@ static CONF_SECTION *NCONF_new_section(const CONF *conf, const char *section) { return NULL; } +static int is_comment(char c) { return c == '#'; } + +static int is_quote(char c) { return c == '"' || c == '\'' || c == '`'; } + +static int is_esc(char c) { return c == '\\'; } + +static int is_conf_ws(char c) { + // This differs from |OPENSSL_isspace| in that CONF does not accept '\v' and + // '\f' as whitespace. + return c == ' ' || c == '\t' || c == '\r' || c == '\n'; +} + +static int is_name_char(char c) { + // Alphanumeric characters, and a handful of symbols, may appear in value and + // section names without escaping. + return OPENSSL_isalnum(c) || c == '_' || c == '!' || c == '.' || c == '%' || + c == '&' || c == '*' || c == '+' || c == ',' || c == '/' || c == ';' || + c == '?' || c == '@' || c == '^' || c == '~' || c == '|' || c == '-'; +} + static int str_copy(CONF *conf, char *section, char **pto, char *from) { int q, to = 0, len = 0; char v; @@ -199,13 +219,13 @@ static int str_copy(CONF *conf, char *section, char **pto, char *from) { } for (;;) { - if (IS_QUOTE(conf, *from)) { + if (is_quote(*from)) { q = *from; from++; - while (!IS_EOF(conf, *from) && (*from != q)) { - if (IS_ESC(conf, *from)) { + while (*from != '\0' && *from != q) { + if (is_esc(*from)) { from++; - if (IS_EOF(conf, *from)) { + if (*from == '\0') { break; } } @@ -214,10 +234,10 @@ static int str_copy(CONF *conf, char *section, char **pto, char *from) { if (*from == q) { from++; } - } else if (IS_ESC(conf, *from)) { + } else if (is_esc(*from)) { from++; v = *(from++); - if (IS_EOF(conf, v)) { + if (v == '\0') { break; } else if (v == 'r') { v = '\r'; @@ -229,11 +249,13 @@ static int str_copy(CONF *conf, char *section, char **pto, char *from) { v = '\t'; } buf->data[to++] = v; - } else if (IS_EOF(conf, *from)) { + } else if (*from == '\0') { break; } else if (*from == '$') { // Historically, $foo would expand to a previously-parsed value. This - // feature has been removed as it was unused and is a DoS vector. + // feature has been removed as it was unused and is a DoS vector. If + // trying to embed '$' in a line, either escape it or wrap the value in + // quotes. OPENSSL_PUT_ERROR(CONF, CONF_R_VARIABLE_EXPANSION_NOT_SUPPORTED); goto err; } else { @@ -312,36 +334,39 @@ static int add_string(const CONF *conf, CONF_SECTION *section, return 1; } -static char *eat_ws(CONF *conf, char *p) { - while (IS_WS(conf, *p) && !IS_EOF(conf, *p)) { +static char *eat_ws(char *p) { + while (*p != '\0' && is_conf_ws(*p)) { p++; } return p; } -#define scan_esc(conf, p) (((IS_EOF((conf), (p)[1])) ? ((p) + 1) : ((p) + 2))) +static char *scan_esc(char *p) { + assert(p[0] == '\\'); + return p[1] == '\0' ? 
p + 1 : p + 2; +} -static char *eat_alpha_numeric(CONF *conf, char *p) { +static char *eat_name(char *p) { for (;;) { - if (IS_ESC(conf, *p)) { - p = scan_esc(conf, p); + if (is_esc(*p)) { + p = scan_esc(p); continue; } - if (!IS_ALPHA_NUMERIC_PUNCT(conf, *p)) { + if (!is_name_char(*p)) { return p; } p++; } } -static char *scan_quote(CONF *conf, char *p) { +static char *scan_quote(char *p) { int q = *p; p++; - while (!IS_EOF(conf, *p) && *p != q) { - if (IS_ESC(conf, *p)) { + while (*p != '\0' && *p != q) { + if (is_esc(*p)) { p++; - if (IS_EOF(conf, *p)) { + if (*p == '\0') { return p; } } @@ -353,28 +378,28 @@ static char *scan_quote(CONF *conf, char *p) { return p; } -static void clear_comments(CONF *conf, char *p) { +static void clear_comments(char *p) { for (;;) { - if (!IS_WS(conf, *p)) { + if (!is_conf_ws(*p)) { break; } p++; } for (;;) { - if (IS_COMMENT(conf, *p)) { + if (is_comment(*p)) { *p = '\0'; return; } - if (IS_QUOTE(conf, *p)) { - p = scan_quote(conf, p); + if (is_quote(*p)) { + p = scan_quote(p); continue; } - if (IS_ESC(conf, *p)) { - p = scan_esc(conf, p); + if (is_esc(*p)) { + p = scan_esc(p); continue; } - if (IS_EOF(conf, *p)) { + if (*p == '\0') { return; } else { p++; @@ -454,7 +479,7 @@ int NCONF_load_bio(CONF *conf, BIO *in, long *out_error_line) { // If we have bytes and the last char '\\' and // second last char is not '\\' p = &(buff->data[bufnum - 1]); - if (IS_ESC(conf, p[0]) && ((bufnum <= 1) || !IS_ESC(conf, p[-1]))) { + if (is_esc(p[0]) && ((bufnum <= 1) || !is_esc(p[-1]))) { bufnum--; again = 1; } @@ -465,20 +490,20 @@ int NCONF_load_bio(CONF *conf, BIO *in, long *out_error_line) { bufnum = 0; buf = buff->data; - clear_comments(conf, buf); - s = eat_ws(conf, buf); - if (IS_EOF(conf, *s)) { + clear_comments(buf); + s = eat_ws(buf); + if (*s == '\0') { continue; // blank line } if (*s == '[') { char *ss; s++; - start = eat_ws(conf, s); + start = eat_ws(s); ss = start; again: - end = eat_alpha_numeric(conf, ss); - p = eat_ws(conf, end); + end = eat_name(ss); + p = eat_ws(end); if (*p != ']') { if (*p != '\0' && ss != p) { ss = p; @@ -502,27 +527,27 @@ int NCONF_load_bio(CONF *conf, BIO *in, long *out_error_line) { } else { pname = s; psection = NULL; - end = eat_alpha_numeric(conf, s); + end = eat_name(s); if ((end[0] == ':') && (end[1] == ':')) { *end = '\0'; end += 2; psection = pname; pname = end; - end = eat_alpha_numeric(conf, end); + end = eat_name(end); } - p = eat_ws(conf, end); + p = eat_ws(end); if (*p != '=') { OPENSSL_PUT_ERROR(CONF, CONF_R_MISSING_EQUAL_SIGN); goto err; } *end = '\0'; p++; - start = eat_ws(conf, p); - while (!IS_EOF(conf, *p)) { + start = eat_ws(p); + while (*p != '\0') { p++; } p--; - while ((p != start) && (IS_WS(conf, *p))) { + while (p != start && is_conf_ws(*p)) { p--; } p++; diff --git a/Sources/CCryptoBoringSSL/crypto/conf/conf_def.h b/Sources/CCryptoBoringSSL/crypto/conf/conf_def.h deleted file mode 100644 index d2c285ae..00000000 --- a/Sources/CCryptoBoringSSL/crypto/conf/conf_def.h +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -// This file was historically generated by keysets.pl in OpenSSL. -// -// TODO(davidben): Replace it with something more readable. 
- -#define CONF_NUMBER 1 -#define CONF_UPPER 2 -#define CONF_LOWER 4 -#define CONF_UNDER 256 -#define CONF_PUNCTUATION 512 -#define CONF_WS 16 -#define CONF_ESC 32 -#define CONF_QUOTE 64 -#define CONF_COMMENT 128 -#define CONF_EOF 8 -#define CONF_HIGHBIT 4096 -#define CONF_ALPHA (CONF_UPPER|CONF_LOWER) -#define CONF_ALPHA_NUMERIC (CONF_ALPHA|CONF_NUMBER|CONF_UNDER) -#define CONF_ALPHA_NUMERIC_PUNCT (CONF_ALPHA|CONF_NUMBER|CONF_UNDER| \ - CONF_PUNCTUATION) - -#define KEYTYPES(c) CONF_type_default -#define IS_COMMENT(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_COMMENT) -#define IS_EOF(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_EOF) -#define IS_ESC(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_ESC) -#define IS_NUMBER(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_NUMBER) -#define IS_WS(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_WS) -#define IS_ALPHA_NUMERIC(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_ALPHA_NUMERIC) -#define IS_ALPHA_NUMERIC_PUNCT(c,a) \ - (KEYTYPES(c)[(a)&0xff]&CONF_ALPHA_NUMERIC_PUNCT) -#define IS_QUOTE(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_QUOTE) - -static const unsigned short CONF_type_default[256]={ - 0x0008,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0010,0x0010,0x0000,0x0000,0x0010,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0010,0x0200,0x0040,0x0080,0x0000,0x0200,0x0200,0x0040, - 0x0000,0x0000,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200, - 0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001, - 0x0001,0x0001,0x0000,0x0200,0x0000,0x0000,0x0000,0x0200, - 0x0200,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 0x0002,0x0002,0x0002,0x0000,0x0020,0x0000,0x0200,0x0100, - 0x0040,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0000,0x0200,0x0000,0x0200,0x0000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - }; diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_apple.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_apple.c index b478e41b..6e90c4c3 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_apple.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_apple.c @@ -15,7 +15,7 @@ #include "internal.h" #if defined(OPENSSL_AARCH64) && defined(OPENSSL_APPLE) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) #include #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_fuchsia.c 
b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_fuchsia.c index fd36f097..36709b06 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_fuchsia.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_fuchsia.c @@ -15,7 +15,7 @@ #include "internal.h" #if defined(OPENSSL_AARCH64) && defined(OPENSSL_FUCHSIA) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) #include #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_linux.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_linux.c index 43e418b3..388a032b 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_linux.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_linux.c @@ -15,7 +15,7 @@ #include "internal.h" #if defined(OPENSSL_AARCH64) && defined(OPENSSL_LINUX) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_openbsd.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_openbsd.c index 09b19738..e44a66af 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_openbsd.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_openbsd.c @@ -15,11 +15,11 @@ #include #if defined(OPENSSL_AARCH64) && defined(OPENSSL_OPENBSD) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) -#include -#include #include +#include +#include #include @@ -27,7 +27,7 @@ void OPENSSL_cpuid_setup(void) { - int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0}; uint64_t cpu_id = 0; size_t len = sizeof(cpu_id); diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_sysreg.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_sysreg.c index 92c533a5..4d04f778 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_sysreg.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_sysreg.c @@ -18,7 +18,8 @@ // expects userspace to simply read them. It traps the reads and fills in CPU // capabilities. 
#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_STATIC_ARMCAP) && \ - (defined(ANDROID_BAREMETAL) || defined(OPENSSL_FREEBSD)) + (defined(ANDROID_BAREMETAL) || defined(OPENSSL_FREEBSD)) && \ + !defined(OPENSSL_NO_ASM) #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_win.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_win.c index 60e29451..3d9fc0f1 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_win.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_win.c @@ -16,7 +16,7 @@ #include "internal.h" #if defined(OPENSSL_AARCH64) && defined(OPENSSL_WINDOWS) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_arm_linux.c b/Sources/CCryptoBoringSSL/crypto/cpu_arm_linux.c index 44f5f2ad..2921df86 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_arm_linux.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_arm_linux.c @@ -143,6 +143,9 @@ void OPENSSL_cpuid_setup(void) { int CRYPTO_has_broken_NEON(void) { return 0; } -int CRYPTO_needs_hwcap2_workaround(void) { return g_needs_hwcap2_workaround; } +int CRYPTO_needs_hwcap2_workaround(void) { + OPENSSL_init_cpuid(); + return g_needs_hwcap2_workaround; +} #endif // OPENSSL_ARM && OPENSSL_LINUX && !OPENSSL_STATIC_ARMCAP diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_intel.c b/Sources/CCryptoBoringSSL/crypto/cpu_intel.c index b34c483c..d5c7b8ba 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_intel.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_intel.c @@ -173,20 +173,21 @@ void OPENSSL_cpuid_setup(void) { OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); - if (is_amd) { - // See https://www.amd.com/system/files/TechDocs/25481.pdf, page 10. - const uint32_t base_family = (eax >> 8) & 15; - const uint32_t base_model = (eax >> 4) & 15; - - uint32_t family = base_family; - uint32_t model = base_model; - if (base_family == 0xf) { - const uint32_t ext_family = (eax >> 20) & 255; - family += ext_family; - const uint32_t ext_model = (eax >> 16) & 15; - model |= ext_model << 4; - } + const uint32_t base_family = (eax >> 8) & 15; + const uint32_t base_model = (eax >> 4) & 15; + + uint32_t family = base_family; + uint32_t model = base_model; + if (base_family == 15) { + const uint32_t ext_family = (eax >> 20) & 255; + family += ext_family; + } + if (base_family == 6 || base_family == 15) { + const uint32_t ext_model = (eax >> 16) & 15; + model |= ext_model << 4; + } + if (is_amd) { if (family < 0x17 || (family == 0x17 && 0x70 <= model && model <= 0x7f)) { // Disable RDRAND on AMD families before 0x17 (Zen) due to reported // failures after suspend. @@ -208,15 +209,6 @@ void OPENSSL_cpuid_setup(void) { // Reserved bit #30 is repurposed to signal an Intel CPU. if (is_intel) { edx |= (1u << 30); - - // Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables - // some Silvermont-specific codepaths which perform better. See OpenSSL - // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and - // |CRYPTO_cpu_perf_is_like_silvermont|. - if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ || - (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) { - ecx &= ~(1u << 26); - } } else { edx &= ~(1u << 30); } @@ -236,25 +228,67 @@ void OPENSSL_cpuid_setup(void) { ecx &= ~(1u << 28); // AVX ecx &= ~(1u << 12); // FMA ecx &= ~(1u << 11); // AMD XOP - // Clear AVX2 and AVX512* bits. - // - // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream - // doesn't clear those. 
See the comments in - // |CRYPTO_hardware_supports_XSAVE|. - extended_features[0] &= - ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31)); + extended_features[0] &= ~(1u << 5); // AVX2 + extended_features[1] &= ~(1u << 9); // VAES + extended_features[1] &= ~(1u << 10); // VPCLMULQDQ } - // See Intel manual, volume 1, section 15.2. + // See Intel manual, volume 1, sections 15.2 ("Detection of AVX-512 Foundation + // Instructions") through 15.4 ("Detection of Intel AVX-512 Instruction Groups + // Operating at 256 and 128-bit Vector Lengths"). if ((xcr0 & 0xe6) != 0xe6) { - // Clear AVX512F. Note we don't touch other AVX512 extensions because they - // can be used with YMM. - extended_features[0] &= ~(1u << 16); + // Without XCR0.111xx11x, no AVX512 feature can be used. This includes ZMM + // registers, masking, SIMD registers 16-31 (even if accessed as YMM or + // XMM), and EVEX-coded instructions (even on YMM or XMM). Even if only + // XCR0.ZMM_Hi256 is missing, it isn't valid to use AVX512 features on + // shorter vectors, since AVX512 ties everything to the availability of + // 512-bit vectors. See the above-mentioned sections of the Intel manual, + // which say that *all* these XCR0 bits must be checked even when just using + // 128-bit or 256-bit vectors, and also volume 2a section 2.7.11 ("#UD + // Equations for EVEX") which says that all EVEX-coded instructions raise an + // undefined-instruction exception if any of these XCR0 bits is zero. + // + // AVX10 fixes this by reorganizing the features that used to be part of + // "AVX512" and allowing them to be used independently of 512-bit support. + // TODO: add AVX10 detection. + extended_features[0] &= ~(1u << 16); // AVX512F + extended_features[0] &= ~(1u << 17); // AVX512DQ + extended_features[0] &= ~(1u << 21); // AVX512IFMA + extended_features[0] &= ~(1u << 26); // AVX512PF + extended_features[0] &= ~(1u << 27); // AVX512ER + extended_features[0] &= ~(1u << 28); // AVX512CD + extended_features[0] &= ~(1u << 30); // AVX512BW + extended_features[0] &= ~(1u << 31); // AVX512VL + extended_features[1] &= ~(1u << 1); // AVX512VBMI + extended_features[1] &= ~(1u << 6); // AVX512VBMI2 + extended_features[1] &= ~(1u << 11); // AVX512VNNI + extended_features[1] &= ~(1u << 12); // AVX512BITALG + extended_features[1] &= ~(1u << 14); // AVX512VPOPCNTDQ } - // Disable ADX instructions on Knights Landing. See OpenSSL commit - // 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. - if ((ecx & (1u << 26)) == 0) { - extended_features[0] &= ~(1u << 19); + // Repurpose the bit for the removed MPX feature to indicate when using zmm + // registers should be avoided even when they are supported. (When set, AVX512 + // features can still be used, but only using ymm or xmm registers.) Skylake + // suffered from severe downclocking when zmm registers were used, which + // affected unrelated code running on the system, making zmm registers not too + // useful outside of benchmarks. The situation improved significantly by Ice + // Lake, but a small amount of downclocking remained. (See + // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/) + // We take a conservative approach of not allowing zmm registers until after + // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side. + // + // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported + // to have any downclocking problem when zmm registers are used. 
+ if (is_intel && family == 6 && + (model == 85 || // Skylake, Cascade Lake, Cooper Lake (server) + model == 106 || // Ice Lake (server) + model == 108 || // Ice Lake (micro server) + model == 125 || // Ice Lake (client) + model == 126 || // Ice Lake (mobile) + model == 140 || // Tiger Lake (mobile) + model == 141)) { // Tiger Lake (client) + extended_features[0] |= 1u << 14; + } else { + extended_features[0] &= ~(1u << 14); } OPENSSL_ia32cap_P[0] = edx; diff --git a/Sources/CCryptoBoringSSL/crypto/crypto.c b/Sources/CCryptoBoringSSL/crypto/crypto.c index ac75d723..9a4f440e 100644 --- a/Sources/CCryptoBoringSSL/crypto/crypto.c +++ b/Sources/CCryptoBoringSSL/crypto/crypto.c @@ -16,31 +16,14 @@ #include -#include "fipsmodule/rand/fork_detect.h" #include "fipsmodule/rand/internal.h" +#include "bcm_support.h" #include "internal.h" static_assert(sizeof(ossl_ssize_t) == sizeof(size_t), "ossl_ssize_t should be the same size as size_t"); -#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_STATIC_ARMCAP) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) -// x86, x86_64, and the ARMs need to record the result of a cpuid/getauxval call -// for the asm to work correctly, unless compiled without asm code. -#define NEED_CPUID - -#else - -// Otherwise, don't emit a static initialiser. - -#if !defined(BORINGSSL_NO_STATIC_INITIALIZER) -#define BORINGSSL_NO_STATIC_INITIALIZER -#endif - -#endif // !NO_ASM && !STATIC_ARMCAP && (X86 || X86_64 || ARM || AARCH64) - // Our assembly does not use the GOT to reference symbols, which means // references to visible symbols will often require a TEXTREL. This is @@ -79,7 +62,7 @@ HIDDEN uint8_t BORINGSSL_function_hit[7] = {0}; HIDDEN uint32_t OPENSSL_ia32cap_P[4] = {0}; uint32_t OPENSSL_get_ia32cap(int idx) { - CRYPTO_library_init(); + OPENSSL_init_cpuid(); return OPENSSL_ia32cap_P[idx]; } @@ -121,60 +104,24 @@ HIDDEN uint32_t OPENSSL_armcap_P = HIDDEN uint32_t OPENSSL_armcap_P = 0; uint32_t *OPENSSL_get_armcap_pointer_for_test(void) { - CRYPTO_library_init(); + OPENSSL_init_cpuid(); return &OPENSSL_armcap_P; } #endif uint32_t OPENSSL_get_armcap(void) { - CRYPTO_library_init(); + OPENSSL_init_cpuid(); return OPENSSL_armcap_P; } #endif -#if defined(BORINGSSL_FIPS) -// In FIPS mode, the power-on self-test function calls |CRYPTO_library_init| -// because we have to ensure that CPUID detection occurs first. -#define BORINGSSL_NO_STATIC_INITIALIZER -#endif - -#if defined(OPENSSL_WINDOWS) && !defined(BORINGSSL_NO_STATIC_INITIALIZER) -#define OPENSSL_CDECL __cdecl -#else -#define OPENSSL_CDECL -#endif - -#if defined(BORINGSSL_NO_STATIC_INITIALIZER) -static CRYPTO_once_t once = CRYPTO_ONCE_INIT; -#elif defined(_MSC_VER) -#pragma section(".CRT$XCU", read) -static void __cdecl do_library_init(void); -__declspec(allocate(".CRT$XCU")) void(*library_init_constructor)(void) = - do_library_init; -#else -static void do_library_init(void) __attribute__ ((constructor)); -#endif - -// do_library_init is the actual initialization function. If -// BORINGSSL_NO_STATIC_INITIALIZER isn't defined, this is set as a static -// initializer. Otherwise, it is called by CRYPTO_library_init. -static void OPENSSL_CDECL do_library_init(void) { - // WARNING: this function may only configure the capability variables. See the - // note above about the linker bug. 
#if defined(NEED_CPUID) - OPENSSL_cpuid_setup(); +static CRYPTO_once_t once = CRYPTO_ONCE_INIT; +void OPENSSL_init_cpuid(void) { CRYPTO_once(&once, OPENSSL_cpuid_setup); } #endif -} -void CRYPTO_library_init(void) { - // TODO(davidben): It would be tidier if this build knob could be replaced - // with an internal lazy-init mechanism that would handle things correctly - // in-library. https://crbug.com/542879 -#if defined(BORINGSSL_NO_STATIC_INITIALIZER) - CRYPTO_once(&once, do_library_init); -#endif -} +void CRYPTO_library_init(void) {} int CRYPTO_is_confidential_build(void) { #if defined(BORINGSSL_CONFIDENTIAL) @@ -194,7 +141,7 @@ int CRYPTO_has_asm(void) { void CRYPTO_pre_sandbox_init(void) { // Read from /proc/cpuinfo if needed. - CRYPTO_library_init(); + OPENSSL_init_cpuid(); // Open /dev/urandom if needed. CRYPTO_init_sysrand(); // Set up MADV_WIPEONFORK state if needed. @@ -235,7 +182,6 @@ int ENGINE_register_all_complete(void) { return 1; } void OPENSSL_load_builtin_modules(void) {} int OPENSSL_init_crypto(uint64_t opts, const OPENSSL_INIT_SETTINGS *settings) { - CRYPTO_library_init(); return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/curve25519/curve25519.c b/Sources/CCryptoBoringSSL/crypto/curve25519/curve25519.c index 03894479..4cf870a5 100644 --- a/Sources/CCryptoBoringSSL/crypto/curve25519/curve25519.c +++ b/Sources/CCryptoBoringSSL/crypto/curve25519/curve25519.c @@ -81,7 +81,7 @@ typedef uint64_t fe_limb_t; #define assert_fe(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ } \ } while (0) @@ -98,7 +98,7 @@ typedef uint64_t fe_limb_t; #define assert_fe_loose(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ } \ } while (0) @@ -120,8 +120,8 @@ typedef uint32_t fe_limb_t; #define assert_fe(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= \ - ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \ } \ } while (0) @@ -138,8 +138,8 @@ typedef uint32_t fe_limb_t; #define assert_fe_loose(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= \ - ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \ } \ } while (0) @@ -150,7 +150,7 @@ static_assert(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS, static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { // |fiat_25519_from_bytes| requires the top-most bit be clear. 
- assert((s[31] & 0x80) == 0); + declassify_assert((s[31] & 0x80) == 0); fiat_25519_from_bytes(h->v, s); assert_fe(h->v); } diff --git a/Sources/CCryptoBoringSSL/crypto/digest_extra/digest_extra.c b/Sources/CCryptoBoringSSL/crypto/digest_extra/digest_extra.c index 6bc495d2..c6a9d443 100644 --- a/Sources/CCryptoBoringSSL/crypto/digest_extra/digest_extra.c +++ b/Sources/CCryptoBoringSSL/crypto/digest_extra/digest_extra.c @@ -61,6 +61,8 @@ #include #include #include +#include +#include #include #include "../asn1/internal.h" @@ -220,6 +222,7 @@ int EVP_marshal_digest_algorithm(CBB *cbb, const EVP_MD *md) { return 0; } + // TODO(crbug.com/boringssl/710): Is this correct? See RFC 4055, section 2.1. if (!CBB_add_asn1(&algorithm, &null, CBS_ASN1_NULL) || !CBB_flush(cbb)) { return 0; @@ -263,3 +266,90 @@ static const EVP_MD evp_md_blake2b256 = { }; const EVP_MD *EVP_blake2b256(void) { return &evp_md_blake2b256; } + + +static void md4_init(EVP_MD_CTX *ctx) { + BSSL_CHECK(MD4_Init(ctx->md_data)); +} + +static void md4_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BSSL_CHECK(MD4_Update(ctx->md_data, data, count)); +} + +static void md4_final(EVP_MD_CTX *ctx, uint8_t *out) { + BSSL_CHECK(MD4_Final(out, ctx->md_data)); +} + +static const EVP_MD evp_md_md4 = { + NID_md4, + MD4_DIGEST_LENGTH, + 0, + md4_init, + md4_update, + md4_final, + 64, + sizeof(MD4_CTX), +}; + +const EVP_MD *EVP_md4(void) { return &evp_md_md4; } + +static void md5_init(EVP_MD_CTX *ctx) { + BSSL_CHECK(MD5_Init(ctx->md_data)); +} + +static void md5_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BSSL_CHECK(MD5_Update(ctx->md_data, data, count)); +} + +static void md5_final(EVP_MD_CTX *ctx, uint8_t *out) { + BSSL_CHECK(MD5_Final(out, ctx->md_data)); +} + +static const EVP_MD evp_md_md5 = { + NID_md5, + MD5_DIGEST_LENGTH, + 0, + md5_init, + md5_update, + md5_final, + 64, + sizeof(MD5_CTX), +}; + +const EVP_MD *EVP_md5(void) { return &evp_md_md5; } + +typedef struct { + MD5_CTX md5; + SHA_CTX sha1; +} MD5_SHA1_CTX; + +static void md5_sha1_init(EVP_MD_CTX *md_ctx) { + MD5_SHA1_CTX *ctx = md_ctx->md_data; + BSSL_CHECK(MD5_Init(&ctx->md5) && SHA1_Init(&ctx->sha1)); +} + +static void md5_sha1_update(EVP_MD_CTX *md_ctx, const void *data, + size_t count) { + MD5_SHA1_CTX *ctx = md_ctx->md_data; + BSSL_CHECK(MD5_Update(&ctx->md5, data, count) && + SHA1_Update(&ctx->sha1, data, count)); +} + +static void md5_sha1_final(EVP_MD_CTX *md_ctx, uint8_t *out) { + MD5_SHA1_CTX *ctx = md_ctx->md_data; + BSSL_CHECK(MD5_Final(out, &ctx->md5) && + SHA1_Final(out + MD5_DIGEST_LENGTH, &ctx->sha1)); +} + +const EVP_MD evp_md_md5_sha1 = { + NID_md5_sha1, + MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH, + 0, + md5_sha1_init, + md5_sha1_update, + md5_sha1_final, + 64, + sizeof(MD5_SHA1_CTX), +}; + +const EVP_MD *EVP_md5_sha1(void) { return &evp_md_md5_sha1; } diff --git a/Sources/CCryptoBoringSSL/crypto/dilithium/dilithium.c b/Sources/CCryptoBoringSSL/crypto/dilithium/dilithium.c new file mode 100644 index 00000000..770ed765 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/dilithium/dilithium.c @@ -0,0 +1,1539 @@ +/* Copyright (c) 2023, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#define OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM +#include + +#include +#include + +#include +#include + +#include "../internal.h" +#include "../keccak/internal.h" +#include "./internal.h" + +#define DEGREE 256 +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define OMEGA 55 + +#define RHO_BYTES 32 +#define SIGMA_BYTES 64 +#define K_BYTES 32 +#define TR_BYTES 64 +#define MU_BYTES 64 +#define RHO_PRIME_BYTES 64 +#define LAMBDA_BITS 192 +#define LAMBDA_BYTES (LAMBDA_BITS / 8) + +// 2^23 - 2^13 + 1 +static const uint32_t kPrime = 8380417; +// Inverse of -kPrime modulo 2^32 +static const uint32_t kPrimeNegInverse = 4236238847; +static const int kDroppedBits = 13; +static const uint32_t kHalfPrime = (8380417 - 1) / 2; +static const uint32_t kGamma1 = 1 << 19; +static const uint32_t kGamma2 = (8380417 - 1) / 32; +// 256^-1 mod kPrime, in Montgomery form. +static const uint32_t kInverseDegreeMontgomery = 41978; + +typedef struct scalar { + uint32_t c[DEGREE]; +} scalar; + +typedef struct vectork { + scalar v[K]; +} vectork; + +typedef struct vectorl { + scalar v[L]; +} vectorl; + +typedef struct matrix { + scalar v[K][L]; +} matrix; + +/* Arithmetic */ + +// This bit of Python will be referenced in some of the following comments: +// +// q = 8380417 +// # Inverse of -q modulo 2^32 +// q_neg_inverse = 4236238847 +// # 2^64 modulo q +// montgomery_square = 2365951 +// +// def bitreverse(i): +// ret = 0 +// for n in range(8): +// bit = i & 1 +// ret <<= 1 +// ret |= bit +// i >>= 1 +// return ret +// +// def montgomery_reduce(x): +// a = (x * q_neg_inverse) % 2**32 +// b = x + a * q +// assert b & 0xFFFF_FFFF == 0 +// c = b >> 32 +// assert c < q +// return c +// +// def montgomery_transform(x): +// return montgomery_reduce(x * montgomery_square) + +// kNTTRootsMontgomery = [ +// montgomery_transform(pow(1753, bitreverse(i), q)) for i in range(256) +// ] +static const uint32_t kNTTRootsMontgomery[256] = { + 4193792, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, + 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, + 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, + 6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, + 4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, + 6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, + 811944, 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, + 4450022, 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, + 7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, + 3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, + 7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, + 5037034, 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, + 4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969, 7063561, + 189548, 4827145, 3159746, 6529015, 5971092, 8202977, 
1315589, 1341330, + 1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961, + 2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955, + 266997, 2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039, + 900702, 1859098, 909542, 819034, 495491, 6767243, 8337157, 7857917, + 7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579, + 342297, 286988, 5942594, 4108315, 3437287, 5038140, 1735879, 203044, + 2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974, + 4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447, + 7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775, + 7100756, 1917081, 5834105, 7005614, 1500165, 777191, 2235880, 3406031, + 7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136, 4603424, + 6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531, 7173032, + 5196991, 162844, 1616392, 3014001, 810149, 1652634, 4686184, 6581310, + 5341501, 3523897, 3866901, 269760, 2213111, 7404533, 1717735, 472078, + 7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524, + 5441381, 6144432, 7959518, 6094090, 183443, 7403526, 1612842, 4834730, + 7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782}; + +// Reduces x mod kPrime in constant time, where 0 <= x < 2*kPrime. +static uint32_t reduce_once(uint32_t x) { + declassify_assert(x < 2 * kPrime); + // return x < kPrime ? x : x - kPrime; + return constant_time_select_int(constant_time_lt_w(x, kPrime), x, x - kPrime); +} + +// Returns the absolute value in constant time. +static uint32_t abs_signed(uint32_t x) { + // return is_positive(x) ? x : -x; + // Note: MSVC doesn't like applying the unary minus operator to unsigned types + // (warning C4146), so we write the negation as a bitwise not plus one + // (assuming two's complement representation). + return constant_time_select_int(constant_time_lt_w(x, 0x80000000), x, ~x + 1); +} + +// Returns the absolute value modulo kPrime. +static uint32_t abs_mod_prime(uint32_t x) { + declassify_assert(x < kPrime); + // return x > kHalfPrime ? kPrime - x : x; + return constant_time_select_int(constant_time_lt_w(kHalfPrime, x), kPrime - x, + x); +} + +// Returns the maximum of two values in constant time. +static uint32_t maximum(uint32_t x, uint32_t y) { + // return x < y ? y : x; + return constant_time_select_int(constant_time_lt_w(x, y), y, x); +} + +static void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_once(lhs->c[i] + rhs->c[i]); + } +} + +static void scalar_sub(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_once(kPrime + lhs->c[i] - rhs->c[i]); + } +} + +static uint32_t reduce_montgomery(uint64_t x) { + uint64_t a = (uint32_t)x * kPrimeNegInverse; + uint64_t b = x + a * kPrime; + declassify_assert((b & 0xffffffff) == 0); + uint32_t c = b >> 32; + return reduce_once(c); +} + +// Multiply two scalars in the number theoretically transformed state. +static void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_montgomery((uint64_t)lhs->c[i] * (uint64_t)rhs->c[i]); + } +} + +// In place number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 35 (`NTT`). 
+static void scalar_ntt(scalar *s) { + // Step: 1, 2, 4, 8, ..., 128 + // Offset: 128, 64, 32, 16, ..., 1 + int offset = DEGREE; + for (int step = 1; step < DEGREE; step <<= 1) { + offset >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = kNTTRootsMontgomery[step + i]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + uint32_t odd = + reduce_montgomery((uint64_t)step_root * (uint64_t)s->c[j + offset]); + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = reduce_once(kPrime + even - odd); + } + k += 2 * offset; + } + } +} + +// In place inverse number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 36 (`NTT^-1`). +static void scalar_inverse_ntt(scalar *s) { + // Step: 128, 64, 32, 16, ..., 1 + // Offset: 1, 2, 4, 8, ..., 128 + int step = DEGREE; + for (int offset = 1; offset < DEGREE; offset <<= 1) { + step >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = + kPrime - kNTTRootsMontgomery[step + (step - 1 - i)]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + uint32_t odd = s->c[j + offset]; + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = reduce_montgomery((uint64_t)step_root * + (uint64_t)(kPrime + even - odd)); + } + k += 2 * offset; + } + } + for (int i = 0; i < DEGREE; i++) { + s->c[i] = reduce_montgomery((uint64_t)s->c[i] * + (uint64_t)kInverseDegreeMontgomery); + } +} + +static void vectork_zero(vectork *out) { OPENSSL_memset(out, 0, sizeof(*out)); } + +static void vectork_add(vectork *out, const vectork *lhs, const vectork *rhs) { + for (int i = 0; i < K; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectork_sub(vectork *out, const vectork *lhs, const vectork *rhs) { + for (int i = 0; i < K; i++) { + scalar_sub(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectork_mult_scalar(vectork *out, const vectork *lhs, + const scalar *rhs) { + for (int i = 0; i < K; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +static void vectork_ntt(vectork *a) { + for (int i = 0; i < K; i++) { + scalar_ntt(&a->v[i]); + } +} + +static void vectork_inverse_ntt(vectork *a) { + for (int i = 0; i < K; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void vectorl_add(vectorl *out, const vectorl *lhs, const vectorl *rhs) { + for (int i = 0; i < L; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectorl_mult_scalar(vectorl *out, const vectorl *lhs, + const scalar *rhs) { + for (int i = 0; i < L; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +static void vectorl_ntt(vectorl *a) { + for (int i = 0; i < L; i++) { + scalar_ntt(&a->v[i]); + } +} + +static void vectorl_inverse_ntt(vectorl *a) { + for (int i = 0; i < L; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void matrix_mult(vectork *out, const matrix *m, const vectorl *a) { + vectork_zero(out); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + scalar product; + scalar_mult(&product, &m->v[i][j], &a->v[j]); + scalar_add(&out->v[i], &out->v[i], &product); + } + } +} + +/* Rounding & hints */ + +// FIPS 204, Algorithm 29 (`Power2Round`). +static void power2_round(uint32_t *r1, uint32_t *r0, uint32_t r) { + *r1 = r >> kDroppedBits; + *r0 = r - (*r1 << kDroppedBits); + + uint32_t r0_adjusted = reduce_once(kPrime + *r0 - (1 << kDroppedBits)); + uint32_t r1_adjusted = *r1 + 1; + + // Mask is set iff r0 > 2^(dropped_bits - 1). 
+ crypto_word_t mask = + constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0); + // r0 = mask ? r0_adjusted : r0 + *r0 = constant_time_select_int(mask, r0_adjusted, *r0); + // r1 = mask ? r1_adjusted : r1 + *r1 = constant_time_select_int(mask, r1_adjusted, *r1); +} + +// Scale back previously rounded value. +static void scale_power2_round(uint32_t *out, uint32_t r1) { + // Pre-condition: 0 <= r1 <= 2^10 - 1 + *out = r1 << kDroppedBits; + // Post-condition: 0 <= out <= 2^23 - 2^13 = kPrime - 1 + assert(*out < kPrime); +} + +// FIPS 204, Algorithm 31 (`HighBits`). +static uint32_t high_bits(uint32_t x) { + // Reference description (given 0 <= x < q): + // + // ``` + // int32_t r0 = x mod+- (2 * kGamma2); + // if (x - r0 == q - 1) { + // return 0; + // } else { + // return (x - r0) / (2 * kGamma2); + // } + // ``` + // + // Below is the formula taken from the reference implementation. + // + // Here, kGamma2 == 2^18 - 2^8 + // This returns ((ceil(x / 2^7) * (2^10 + 1) + 2^21) / 2^22) mod 2^4 + uint32_t r1 = (x + 127) >> 7; + r1 = (r1 * 1025 + (1 << 21)) >> 22; + r1 &= 15; + return r1; +} + +// FIPS 204, Algorithm 30 (`Decompose`). +static void decompose(uint32_t *r1, int32_t *r0, uint32_t r) { + *r1 = high_bits(r); + + *r0 = r; + *r0 -= *r1 * 2 * (int32_t)kGamma2; + *r0 -= (((int32_t)kHalfPrime - *r0) >> 31) & (int32_t)kPrime; +} + +// FIPS 204, Algorithm 32 (`LowBits`). +static int32_t low_bits(uint32_t x) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, x); + return r0; +} + +// FIPS 204, Algorithm 33 (`MakeHint`). +static int32_t make_hint(uint32_t ct0, uint32_t cs2, uint32_t w) { + uint32_t r_plus_z = reduce_once(kPrime + w - cs2); + uint32_t r = reduce_once(r_plus_z + ct0); + return high_bits(r) != high_bits(r_plus_z); +} + +// FIPS 204, Algorithm 34 (`UseHint`). 
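+//
+// Given a hint bit produced by |make_hint|, this recovers
+// high_bits(w - cs2) from r = w - cs2 + ct0 without knowledge of |cs2| or
+// |ct0| (provided |ct0| stays below kGamma2, which the signer enforces before
+// emitting hints). That is what lets the verifier, which can only compute
+// A*z - c*t1*2^d = w - c*s2 + c*t0, reconstruct the same w1 the signer
+// committed to.
+//
+// As a worked example of |decompose| with kGamma2 = 261888: for r = 1000000,
+// r1 = 2 and r0 = -47552, since 1000000 = 2*(2*261888) - 47552 and
+// |r0| <= kGamma2.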
+static uint32_t use_hint_vartime(uint32_t h, uint32_t r) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, r); + + if (h) { + if (r0 > 0) { + return (r1 + 1) & 15; + } else { + return (r1 - 1) & 15; + } + } else { + return r1; + } +} + +static void scalar_power2_round(scalar *s1, scalar *s0, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + power2_round(&s1->c[i], &s0->c[i], s->c[i]); + } +} + +static void scalar_scale_power2_round(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + scale_power2_round(&out->c[i], in->c[i]); + } +} + +static void scalar_high_bits(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = high_bits(in->c[i]); + } +} + +static void scalar_low_bits(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = low_bits(in->c[i]); + } +} + +static void scalar_max(uint32_t *max, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + uint32_t abs = abs_mod_prime(s->c[i]); + *max = maximum(*max, abs); + } +} + +static void scalar_max_signed(uint32_t *max, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + uint32_t abs = abs_signed(s->c[i]); + *max = maximum(*max, abs); + } +} + +static void scalar_make_hint(scalar *out, const scalar *ct0, const scalar *cs2, + const scalar *w) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = make_hint(ct0->c[i], cs2->c[i], w->c[i]); + } +} + +static void scalar_use_hint_vartime(scalar *out, const scalar *h, + const scalar *r) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = use_hint_vartime(h->c[i], r->c[i]); + } +} + +static void vectork_power2_round(vectork *t1, vectork *t0, const vectork *t) { + for (int i = 0; i < K; i++) { + scalar_power2_round(&t1->v[i], &t0->v[i], &t->v[i]); + } +} + +static void vectork_scale_power2_round(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_scale_power2_round(&out->v[i], &in->v[i]); + } +} + +static void vectork_high_bits(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_high_bits(&out->v[i], &in->v[i]); + } +} + +static void vectork_low_bits(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_low_bits(&out->v[i], &in->v[i]); + } +} + +static uint32_t vectork_max(const vectork *a) { + uint32_t max = 0; + for (int i = 0; i < K; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +static uint32_t vectork_max_signed(const vectork *a) { + uint32_t max = 0; + for (int i = 0; i < K; i++) { + scalar_max_signed(&max, &a->v[i]); + } + return max; +} + +// The input vector contains only zeroes and ones. +static size_t vectork_count_ones(const vectork *a) { + size_t count = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < DEGREE; j++) { + count += a->v[i].c[j]; + } + } + return count; +} + +static void vectork_make_hint(vectork *out, const vectork *ct0, + const vectork *cs2, const vectork *w) { + for (int i = 0; i < K; i++) { + scalar_make_hint(&out->v[i], &ct0->v[i], &cs2->v[i], &w->v[i]); + } +} + +static void vectork_use_hint_vartime(vectork *out, const vectork *h, + const vectork *r) { + for (int i = 0; i < K; i++) { + scalar_use_hint_vartime(&out->v[i], &h->v[i], &r->v[i]); + } +} + +static uint32_t vectorl_max(const vectorl *a) { + uint32_t max = 0; + for (int i = 0; i < L; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +/* Bit packing */ + +static const uint8_t kMasks[8] = {0x01, 0x03, 0x07, 0x0f, + 0x1f, 0x3f, 0x7f, 0xff}; + +// FIPS 204, Algorithm 10 (`SimpleBitPack`). 
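+//
+// Packs the 256 coefficients little-endian, |bits| bits each, into 32*|bits|
+// bytes. For example, with |bits| = 10 the first coefficients c0 = 0x155,
+// c1 = 0x2AA, c2, ... encode as
+//
+//   out[0] = c0 & 0xff                       = 0x55
+//   out[1] = (c0 >> 8) | ((c1 & 0x3f) << 2)  = 0xa9
+//   out[2] = (c1 >> 6) | ((c2 & 0x0f) << 4)
+//
+// so each scalar occupies a whole number of bytes (320 bytes at 10 bits).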
+static void scalar_encode(uint8_t *out, const scalar *s, int bits) { + assert(bits <= (int)sizeof(*s->c) * 8 && bits != 1); + + uint8_t out_byte = 0; + int out_byte_bits = 0; + + for (int i = 0; i < DEGREE; i++) { + uint32_t element = s->c[i]; + int element_bits_done = 0; + + while (element_bits_done < bits) { + int chunk_bits = bits - element_bits_done; + int out_bits_remaining = 8 - out_byte_bits; + if (chunk_bits >= out_bits_remaining) { + chunk_bits = out_bits_remaining; + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + *out = out_byte; + out++; + out_byte_bits = 0; + out_byte = 0; + } else { + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + out_byte_bits += chunk_bits; + } + + element_bits_done += chunk_bits; + element >>= chunk_bits; + } + } + + if (out_byte_bits > 0) { + *out = out_byte; + } +} + +// FIPS 204, Algorithm 11 (`BitPack`). +static void scalar_encode_signed(uint8_t *out, const scalar *s, int bits, + uint32_t max) { + assert(bits <= (int)sizeof(*s->c) * 8 && bits != 1); + + uint8_t out_byte = 0; + int out_byte_bits = 0; + + for (int i = 0; i < DEGREE; i++) { + uint32_t element = reduce_once(kPrime + max - s->c[i]); + declassify_assert(element <= 2 * max); + int element_bits_done = 0; + + while (element_bits_done < bits) { + int chunk_bits = bits - element_bits_done; + int out_bits_remaining = 8 - out_byte_bits; + if (chunk_bits >= out_bits_remaining) { + chunk_bits = out_bits_remaining; + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + *out = out_byte; + out++; + out_byte_bits = 0; + out_byte = 0; + } else { + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + out_byte_bits += chunk_bits; + } + + element_bits_done += chunk_bits; + element >>= chunk_bits; + } + } + + if (out_byte_bits > 0) { + *out = out_byte; + } +} + +// FIPS 204, Algorithm 12 (`SimpleBitUnpack`). +static void scalar_decode(scalar *out, const uint8_t *in, int bits) { + assert(bits <= (int)sizeof(*out->c) * 8 && bits != 1); + + uint8_t in_byte = 0; + int in_byte_bits_left = 0; + + for (int i = 0; i < DEGREE; i++) { + uint32_t element = 0; + int element_bits_done = 0; + + while (element_bits_done < bits) { + if (in_byte_bits_left == 0) { + in_byte = *in; + in++; + in_byte_bits_left = 8; + } + + int chunk_bits = bits - element_bits_done; + if (chunk_bits > in_byte_bits_left) { + chunk_bits = in_byte_bits_left; + } + + element |= (in_byte & kMasks[chunk_bits - 1]) << element_bits_done; + in_byte_bits_left -= chunk_bits; + in_byte >>= chunk_bits; + + element_bits_done += chunk_bits; + } + + out->c[i] = element; + } +} + +// FIPS 204, Algorithm 13 (`BitUnpack`). +static int scalar_decode_signed(scalar *out, const uint8_t *in, int bits, + uint32_t max) { + assert(bits <= (int)sizeof(*out->c) * 8 && bits != 1); + + uint8_t in_byte = 0; + int in_byte_bits_left = 0; + + for (int i = 0; i < DEGREE; i++) { + uint32_t element = 0; + int element_bits_done = 0; + + while (element_bits_done < bits) { + if (in_byte_bits_left == 0) { + in_byte = *in; + in++; + in_byte_bits_left = 8; + } + + int chunk_bits = bits - element_bits_done; + if (chunk_bits > in_byte_bits_left) { + chunk_bits = in_byte_bits_left; + } + + element |= (in_byte & kMasks[chunk_bits - 1]) << element_bits_done; + in_byte_bits_left -= chunk_bits; + in_byte >>= chunk_bits; + + element_bits_done += chunk_bits; + } + + // This may be only out of range in cases of invalid input, in which case it + // is okay to leak the value. 
This function is also called with secret + // input during signing, in |scalar_sample_mask|. However, in that case + // (and in any case when |max| is a power of two), this case is impossible. + if (constant_time_declassify_int(element > 2 * max)) { + return 0; + } + out->c[i] = reduce_once(kPrime + max - element); + } + + return 1; +} + +/* Expansion functions */ + +// FIPS 204, Algorithm 24 (`RejNTTPoly`). +// +// Rejection samples a Keccak stream to get uniformly distributed elements. This +// is used for matrix expansion and only operates on public inputs. +static void scalar_from_keccak_vartime( + scalar *out, const uint8_t derived_seed[RHO_BYTES + 2]) { + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, RHO_BYTES + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 168); + static_assert(168 % 3 == 0, "block and coefficient boundaries do not align"); + + int done = 0; + while (done < DEGREE) { + uint8_t block[168]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; i += 3) { + // FIPS 204, Algorithm 8 (`CoeffFromThreeBytes`). + uint32_t value = (uint32_t)block[i] | ((uint32_t)block[i + 1] << 8) | + (((uint32_t)block[i + 2] & 0x7f) << 16); + if (value < kPrime) { + out->c[done++] = value; + } + } + } +} + +// FIPS 204, Algorithm 25 (`RejBoundedPoly`). +static void scalar_uniform_eta_4(scalar *out, + const uint8_t derived_seed[SIGMA_BYTES + 2]) { + static_assert(ETA == 4, "This implementation is specialized for ETA == 4"); + + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, SIGMA_BYTES + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + int done = 0; + while (done < DEGREE) { + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; ++i) { + uint32_t t0 = block[i] & 0x0F; + uint32_t t1 = block[i] >> 4; + // FIPS 204, Algorithm 9 (`CoefFromHalfByte`). Although both the input and + // output here are secret, it is OK to leak when we rejected a byte. + // Individual bytes of the SHAKE-256 stream are (indistiguishable from) + // independent of each other and the original seed, so leaking information + // about the rejected bytes does not reveal the input or output. + if (constant_time_declassify_int(t0 < 9)) { + out->c[done++] = reduce_once(kPrime + ETA - t0); + } + if (done < DEGREE && constant_time_declassify_int(t1 < 9)) { + out->c[done++] = reduce_once(kPrime + ETA - t1); + } + } + } +} + +// FIPS 204, Algorithm 28 (`ExpandMask`). +static void scalar_sample_mask( + scalar *out, const uint8_t derived_seed[RHO_PRIME_BYTES + 2]) { + uint8_t buf[640]; + BORINGSSL_keccak(buf, sizeof(buf), derived_seed, RHO_PRIME_BYTES + 2, + boringssl_shake256); + + // Note: Decoding 20 bits into (-2^19, 2^19] cannot fail. + scalar_decode_signed(out, buf, 20, 1 << 19); +} + +// FIPS 204, Algorithm 23 (`SampleInBall`). 
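+//
+// The challenge polynomial produced here has exactly TAU (49) coefficients
+// equal to +1 or -1 and all other coefficients zero. The loop is an in-place
+// Fisher-Yates shuffle: at step i a uniform index |byte| <= i is drawn, the
+// value currently at |byte| moves to position i, and position |byte| receives
+// a fresh +-1, so after the final step exactly TAU entries are non-zero. The
+// small weight of c keeps c*s1 and c*s2 small (infinity norm at most
+// BETA = TAU * ETA = 196), which the rejection bounds below rely on.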
+static void scalar_sample_in_ball_vartime(scalar *out, const uint8_t *seed, + int len) { + assert(len == 32); + + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, seed, len); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + + uint64_t signs = CRYPTO_load_u64_le(block); + int offset = 8; + // SampleInBall implements a Fisher–Yates shuffle, which unavoidably leaks + // where the zeros are by memory access pattern. Although this leak happens + // before bad signatures are rejected, this is safe. See + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/8d8f01ac_70af3f21/ + CONSTTIME_DECLASSIFY(block + offset, sizeof(block) - offset); + + OPENSSL_memset(out, 0, sizeof(*out)); + for (size_t i = DEGREE - TAU; i < DEGREE; i++) { + size_t byte; + for (;;) { + if (offset == 136) { + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + // See above. + CONSTTIME_DECLASSIFY(block, sizeof(block)); + offset = 0; + } + + byte = block[offset++]; + if (byte <= i) { + break; + } + } + + out->c[i] = out->c[byte]; + out->c[byte] = reduce_once(kPrime + 1 - 2 * (signs & 1)); + signs >>= 1; + } +} + +// FIPS 204, Algorithm 26 (`ExpandA`). +static void matrix_expand(matrix *out, const uint8_t rho[RHO_BYTES]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + + uint8_t derived_seed[RHO_BYTES + 2]; + OPENSSL_memcpy(derived_seed, rho, RHO_BYTES); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + derived_seed[RHO_BYTES + 1] = i; + derived_seed[RHO_BYTES] = j; + scalar_from_keccak_vartime(&out->v[i][j], derived_seed); + } + } +} + +// FIPS 204, Algorithm 27 (`ExpandS`). +static void vector_expand_short(vectorl *s1, vectork *s2, + const uint8_t sigma[SIGMA_BYTES]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + static_assert(K + L <= 0x100, "K+L must fit in 8 bits"); + + uint8_t derived_seed[SIGMA_BYTES + 2]; + OPENSSL_memcpy(derived_seed, sigma, SIGMA_BYTES); + derived_seed[SIGMA_BYTES] = 0; + derived_seed[SIGMA_BYTES + 1] = 0; + for (int i = 0; i < L; i++) { + scalar_uniform_eta_4(&s1->v[i], derived_seed); + ++derived_seed[SIGMA_BYTES]; + } + for (int i = 0; i < K; i++) { + scalar_uniform_eta_4(&s2->v[i], derived_seed); + ++derived_seed[SIGMA_BYTES]; + } +} + +// FIPS 204, Algorithm 28 (`ExpandMask`). +static void vectorl_expand_mask(vectorl *out, + const uint8_t seed[RHO_PRIME_BYTES], + size_t kappa) { + assert(kappa + L <= 0x10000); + + uint8_t derived_seed[RHO_PRIME_BYTES + 2]; + OPENSSL_memcpy(derived_seed, seed, RHO_PRIME_BYTES); + for (int i = 0; i < L; i++) { + size_t index = kappa + i; + derived_seed[RHO_PRIME_BYTES] = index & 0xFF; + derived_seed[RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF; + scalar_sample_mask(&out->v[i], derived_seed); + } +} + +/* Encoding */ + +// FIPS 204, Algorithm 10 (`SimpleBitPack`). +// +// Encodes an entire vector into 32*K*|bits| bytes. Note that since 256 (DEGREE) +// is divisible by 8, the individual vector entries will always fill a whole +// number of bytes, so we do not need to worry about bit packing here. 
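+//
+// For example, |bits| = 10 (the t1 encoding) gives 320 bytes per scalar and
+// 320*K bytes per vector, and |bits| = 4 (the w1 encoding at this parameter
+// set) gives 128 bytes per scalar and 128*K bytes per vector, matching the
+// buffer sizes used by the marshalling code and |w1_encode| below.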
+static void vectork_encode(uint8_t *out, const vectork *a, int bits) { + for (int i = 0; i < K; i++) { + scalar_encode(out + i * bits * DEGREE / 8, &a->v[i], bits); + } +} + +// FIPS 204, Algorithm 12 (`SimpleBitUnpack`). +static void vectork_decode(vectork *out, const uint8_t *in, int bits) { + for (int i = 0; i < K; i++) { + scalar_decode(&out->v[i], in + i * bits * DEGREE / 8, bits); + } +} + +static void vectork_encode_signed(uint8_t *out, const vectork *a, int bits, + uint32_t max) { + for (int i = 0; i < K; i++) { + scalar_encode_signed(out + i * bits * DEGREE / 8, &a->v[i], bits, max); + } +} + +static int vectork_decode_signed(vectork *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < K; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * DEGREE / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 11 (`BitPack`). +// +// Encodes an entire vector into 32*L*|bits| bytes. Note that since 256 (DEGREE) +// is divisible by 8, the individual vector entries will always fill a whole +// number of bytes, so we do not need to worry about bit packing here. +static void vectorl_encode_signed(uint8_t *out, const vectorl *a, int bits, + uint32_t max) { + for (int i = 0; i < L; i++) { + scalar_encode_signed(out + i * bits * DEGREE / 8, &a->v[i], bits, max); + } +} + +static int vectorl_decode_signed(vectorl *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < L; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * DEGREE / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 22 (`w1Encode`). +// +// The output must point to an array of 128*K bytes. +static void w1_encode(uint8_t *out, const vectork *w1) { + vectork_encode(out, w1, 4); +} + +// FIPS 204, Algorithm 14 (`HintBitPack`). +static void hint_bit_pack(uint8_t *out, const vectork *h) { + OPENSSL_memset(out, 0, OMEGA + K); + int index = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < DEGREE; j++) { + if (h->v[i].c[j]) { + out[index++] = j; + } + } + out[OMEGA + i] = index; + } +} + +// FIPS 204, Algorithm 15 (`HintBitUnpack`). +static int hint_bit_unpack(vectork *h, const uint8_t *in) { + vectork_zero(h); + int index = 0; + for (int i = 0; i < K; i++) { + int limit = in[OMEGA + i]; + if (limit < index || limit > OMEGA) { + return 0; + } + + int last = -1; + while (index < limit) { + int byte = in[index++]; + if (last >= 0 && byte <= last) { + return 0; + } + last = byte; + h->v[i].c[byte] = 1; + } + } + for (; index < OMEGA; index++) { + if (in[index] != 0) { + return 0; + } + } + return 1; +} + +struct public_key { + uint8_t rho[RHO_BYTES]; + vectork t1; + // Pre-cached value(s). + uint8_t public_key_hash[TR_BYTES]; +}; + +struct private_key { + uint8_t rho[RHO_BYTES]; + uint8_t k[K_BYTES]; + uint8_t public_key_hash[TR_BYTES]; + vectorl s1; + vectork s2; + vectork t0; +}; + +struct signature { + uint8_t c_tilde[2 * LAMBDA_BYTES]; + vectorl z; + vectork h; +}; + +// FIPS 204, Algorithm 16 (`pkEncode`). +static int dilithium_marshal_public_key(CBB *out, + const struct public_key *pub) { + if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho))) { + return 0; + } + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 320 * K)) { + return 0; + } + vectork_encode(vectork_output, &pub->t1, 10); + + return 1; +} + +// FIPS 204, Algorithm 17 (`pkDecode`). 
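+//
+// With this parameter set the encodings handled here have fixed sizes: the
+// public key is RHO_BYTES + 320*K = 1952 bytes (DILITHIUM_PUBLIC_KEY_BYTES),
+// the private key is 32 + 32 + 64 + 128*L + 128*K + 416*K = 4032 bytes, and
+// the signature is 2*LAMBDA_BYTES + 640*L + OMEGA + K = 3309 bytes
+// (DILITHIUM_SIGNATURE_BYTES).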
+static int dilithium_parse_public_key(struct public_key *pub, CBS *in) { + if (!CBS_copy_bytes(in, pub->rho, sizeof(pub->rho))) { + return 0; + } + + CBS t1_bytes; + if (!CBS_get_bytes(in, &t1_bytes, 320 * K)) { + return 0; + } + vectork_decode(&pub->t1, CBS_data(&t1_bytes), 10); + + return 1; +} + +// FIPS 204, Algorithm 18 (`skEncode`). +static int dilithium_marshal_private_key(CBB *out, + const struct private_key *priv) { + if (!CBB_add_bytes(out, priv->rho, sizeof(priv->rho)) || + !CBB_add_bytes(out, priv->k, sizeof(priv->k)) || + !CBB_add_bytes(out, priv->public_key_hash, + sizeof(priv->public_key_hash))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, 128 * L)) { + return 0; + } + vectorl_encode_signed(vectorl_output, &priv->s1, 4, ETA); + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 128 * K)) { + return 0; + } + vectork_encode_signed(vectork_output, &priv->s2, 4, ETA); + + if (!CBB_add_space(out, &vectork_output, 416 * K)) { + return 0; + } + vectork_encode_signed(vectork_output, &priv->t0, 13, 1 << 12); + + return 1; +} + +// FIPS 204, Algorithm 19 (`skDecode`). +static int dilithium_parse_private_key(struct private_key *priv, CBS *in) { + CBS s1_bytes; + CBS s2_bytes; + CBS t0_bytes; + if (!CBS_copy_bytes(in, priv->rho, sizeof(priv->rho)) || + !CBS_copy_bytes(in, priv->k, sizeof(priv->k)) || + !CBS_copy_bytes(in, priv->public_key_hash, + sizeof(priv->public_key_hash)) || + !CBS_get_bytes(in, &s1_bytes, 128 * L) || + !vectorl_decode_signed(&priv->s1, CBS_data(&s1_bytes), 4, ETA) || + !CBS_get_bytes(in, &s2_bytes, 128 * K) || + !vectork_decode_signed(&priv->s2, CBS_data(&s2_bytes), 4, ETA) || + !CBS_get_bytes(in, &t0_bytes, 416 * K) || + // Note: Decoding 13 bits into (-2^12, 2^12] cannot fail. + !vectork_decode_signed(&priv->t0, CBS_data(&t0_bytes), 13, 1 << 12)) { + return 0; + } + + return 1; +} + +// FIPS 204, Algorithm 20 (`sigEncode`). +static int dilithium_marshal_signature(CBB *out, const struct signature *sign) { + if (!CBB_add_bytes(out, sign->c_tilde, sizeof(sign->c_tilde))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, 640 * L)) { + return 0; + } + vectorl_encode_signed(vectorl_output, &sign->z, 20, 1 << 19); + + uint8_t *hint_output; + if (!CBB_add_space(out, &hint_output, OMEGA + K)) { + return 0; + } + hint_bit_pack(hint_output, &sign->h); + + return 1; +} + +// FIPS 204, Algorithm 21 (`sigDecode`). +static int dilithium_parse_signature(struct signature *sign, CBS *in) { + CBS z_bytes; + CBS hint_bytes; + if (!CBS_copy_bytes(in, sign->c_tilde, sizeof(sign->c_tilde)) || + !CBS_get_bytes(in, &z_bytes, 640 * L) || + // Note: Decoding 20 bits into (-2^19, 2^19] cannot fail. 
+ !vectorl_decode_signed(&sign->z, CBS_data(&z_bytes), 20, 1 << 19) || + !CBS_get_bytes(in, &hint_bytes, OMEGA + K) || + !hint_bit_unpack(&sign->h, CBS_data(&hint_bytes))) { + return 0; + }; + + return 1; +} + +static struct private_key *private_key_from_external( + const struct DILITHIUM_private_key *external) { + static_assert( + sizeof(struct DILITHIUM_private_key) == sizeof(struct private_key), + "Kyber private key size incorrect"); + static_assert( + alignof(struct DILITHIUM_private_key) == alignof(struct private_key), + "Kyber private key align incorrect"); + return (struct private_key *)external; +} + +static struct public_key *public_key_from_external( + const struct DILITHIUM_public_key *external) { + static_assert( + sizeof(struct DILITHIUM_public_key) == sizeof(struct public_key), + "Dilithium public key size incorrect"); + static_assert( + alignof(struct DILITHIUM_public_key) == alignof(struct public_key), + "Dilithium public key align incorrect"); + return (struct public_key *)external; +} + +/* API */ + +// Calls |DILITHIUM_generate_key_external_entropy| with random bytes from +// |RAND_bytes|. Returns 1 on success and 0 on failure. +int DILITHIUM_generate_key( + uint8_t out_encoded_public_key[DILITHIUM_PUBLIC_KEY_BYTES], + struct DILITHIUM_private_key *out_private_key) { + uint8_t entropy[DILITHIUM_GENERATE_KEY_ENTROPY]; + RAND_bytes(entropy, sizeof(entropy)); + return DILITHIUM_generate_key_external_entropy(out_encoded_public_key, + out_private_key, entropy); +} + +// FIPS 204, Algorithm 1 (`ML-DSA.KeyGen`). Returns 1 on success and 0 on +// failure. +int DILITHIUM_generate_key_external_entropy( + uint8_t out_encoded_public_key[DILITHIUM_PUBLIC_KEY_BYTES], + struct DILITHIUM_private_key *out_private_key, + const uint8_t entropy[DILITHIUM_GENERATE_KEY_ENTROPY]) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + struct public_key pub; + matrix a_ntt; + vectorl s1_ntt; + vectork t; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + struct private_key *priv = private_key_from_external(out_private_key); + + uint8_t expanded_seed[RHO_BYTES + SIGMA_BYTES + K_BYTES]; + BORINGSSL_keccak(expanded_seed, sizeof(expanded_seed), entropy, + DILITHIUM_GENERATE_KEY_ENTROPY, boringssl_shake256); + const uint8_t *const rho = expanded_seed; + const uint8_t *const sigma = expanded_seed + RHO_BYTES; + const uint8_t *const k = expanded_seed + RHO_BYTES + SIGMA_BYTES; + // rho is public. + CONSTTIME_DECLASSIFY(rho, RHO_BYTES); + OPENSSL_memcpy(values->pub.rho, rho, sizeof(values->pub.rho)); + OPENSSL_memcpy(priv->rho, rho, sizeof(priv->rho)); + OPENSSL_memcpy(priv->k, k, sizeof(priv->k)); + + matrix_expand(&values->a_ntt, rho); + vector_expand_short(&priv->s1, &priv->s2, sigma); + + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vectork_inverse_ntt(&values->t); + vectork_add(&values->t, &values->t, &priv->s2); + + vectork_power2_round(&values->pub.t1, &priv->t0, &values->t); + // t1 is public. 
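+  // (t = A*s1 + s2 is split by Power2Round into t1 and t0; t1 is encoded into
+  // the public key below while t0 remains part of the private key, so only t1
+  // needs to be treated as public here.)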
+ CONSTTIME_DECLASSIFY(&values->pub.t1, sizeof(values->pub.t1)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_public_key, DILITHIUM_PUBLIC_KEY_BYTES); + if (!dilithium_marshal_public_key(&cbb, &values->pub)) { + goto err; + } + + BORINGSSL_keccak(priv->public_key_hash, sizeof(priv->public_key_hash), + out_encoded_public_key, DILITHIUM_PUBLIC_KEY_BYTES, + boringssl_shake256); + + ret = 1; +err: + OPENSSL_free(values); + return ret; +} + +int DILITHIUM_public_from_private( + struct DILITHIUM_public_key *out_public_key, + const struct DILITHIUM_private_key *private_key) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + matrix a_ntt; + vectorl s1_ntt; + vectork t; + vectork t0; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + const struct private_key *priv = private_key_from_external(private_key); + struct public_key *pub = public_key_from_external(out_public_key); + + OPENSSL_memcpy(pub->rho, priv->rho, sizeof(pub->rho)); + OPENSSL_memcpy(pub->public_key_hash, priv->public_key_hash, + sizeof(pub->public_key_hash)); + + matrix_expand(&values->a_ntt, priv->rho); + + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vectork_inverse_ntt(&values->t); + vectork_add(&values->t, &values->t, &priv->s2); + + vectork_power2_round(&pub->t1, &values->t0, &values->t); + + ret = 1; +err: + OPENSSL_free(values); + return ret; +} + +// FIPS 204, Algorithm 2 (`ML-DSA.Sign`). Returns 1 on success and 0 on failure. +static int dilithium_sign_with_randomizer( + uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, const uint8_t *msg, + size_t msg_len, + const uint8_t randomizer[DILITHIUM_SIGNATURE_RANDOMIZER_BYTES]) { + int ret = 0; + + const struct private_key *priv = private_key_from_external(private_key); + + uint8_t mu[MU_BYTES]; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->public_key_hash, + sizeof(priv->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, MU_BYTES); + + uint8_t rho_prime[RHO_PRIME_BYTES]; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->k, sizeof(priv->k)); + BORINGSSL_keccak_absorb(&keccak_ctx, randomizer, + DILITHIUM_SIGNATURE_RANDOMIZER_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_squeeze(&keccak_ctx, rho_prime, RHO_PRIME_BYTES); + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. 
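+  // The loop below is the rejection-sampling ("Fiat-Shamir with aborts") core
+  // of signing: each iteration expands a fresh masking vector y from
+  // rho_prime and kappa, derives a candidate (c_tilde, z, h), and starts over
+  // whenever one of the norm or hint-weight bounds fails, so that the
+  // distribution of released signatures does not depend on the secret key.
+  // |struct values_st| holds both the NTT-domain copies of the key computed
+  // once up front and the per-attempt scratch vectors.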
+ struct values_st { + struct signature sign; + vectorl s1_ntt; + vectork s2_ntt; + vectork t0_ntt; + matrix a_ntt; + vectorl y; + vectorl y_ntt; + vectork w; + vectork w1; + vectorl cs1; + vectork cs2; + vectork r0; + vectork ct0; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + OPENSSL_memcpy(&values->s2_ntt, &priv->s2, sizeof(values->s2_ntt)); + vectork_ntt(&values->s2_ntt); + + OPENSSL_memcpy(&values->t0_ntt, &priv->t0, sizeof(values->t0_ntt)); + vectork_ntt(&values->t0_ntt); + + matrix_expand(&values->a_ntt, priv->rho); + + for (size_t kappa = 0;; kappa += L) { + // TODO(bbe): y only lives long enough to compute y_ntt. + // consider using another vectorl to save memory. + vectorl_expand_mask(&values->y, rho_prime, kappa); + + OPENSSL_memcpy(&values->y_ntt, &values->y, sizeof(values->y_ntt)); + vectorl_ntt(&values->y_ntt); + + // TODO(bbe): w only lives long enough to compute y_ntt. + // consider using another vectork to save memory. + matrix_mult(&values->w, &values->a_ntt, &values->y_ntt); + vectork_inverse_ntt(&values->w); + + vectork_high_bits(&values->w1, &values->w); + uint8_t w1_encoded[128 * K]; + w1_encode(w1_encoded, &values->w1); + + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K); + BORINGSSL_keccak_squeeze(&keccak_ctx, values->sign.c_tilde, + 2 * LAMBDA_BYTES); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, 32); + scalar_ntt(&c_ntt); + + vectorl_mult_scalar(&values->cs1, &values->s1_ntt, &c_ntt); + vectorl_inverse_ntt(&values->cs1); + vectork_mult_scalar(&values->cs2, &values->s2_ntt, &c_ntt); + vectork_inverse_ntt(&values->cs2); + + vectorl_add(&values->sign.z, &values->y, &values->cs1); + + vectork_sub(&values->r0, &values->w, &values->cs2); + vectork_low_bits(&values->r0, &values->r0); + + // Leaking the fact that a signature was rejected is fine as the next + // attempt at a signature will be (indistinguishable from) independent of + // this one. Note, however, that we additionally leak which of the two + // branches rejected the signature. Section 5.5 of + // https://pq-crystals.org/dilithium/data/dilithium-specification-round3.pdf + // describes this leak as OK. Note we leak less than what is described by + // the paper; we do not reveal which coefficient violated the bound, and we + // hide which of the |z_max| or |r0_max| bound failed. See also + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/2bbab0fa_d241d35a/ + uint32_t z_max = vectorl_max(&values->sign.z); + uint32_t r0_max = vectork_max_signed(&values->r0); + if (constant_time_declassify_w( + constant_time_ge_w(z_max, kGamma1 - BETA) | + constant_time_ge_w(r0_max, kGamma2 - BETA))) { + continue; + } + + vectork_mult_scalar(&values->ct0, &values->t0_ntt, &c_ntt); + vectork_inverse_ntt(&values->ct0); + vectork_make_hint(&values->sign.h, &values->ct0, &values->cs2, &values->w); + + // See above. + uint32_t ct0_max = vectork_max(&values->ct0); + size_t h_ones = vectork_count_ones(&values->sign.h); + if (constant_time_declassify_w(constant_time_ge_w(ct0_max, kGamma2) | + constant_time_lt_w(OMEGA, h_ones))) { + continue; + } + + // Although computed with the private key, the signature is public. 
+ CONSTTIME_DECLASSIFY(values->sign.c_tilde, sizeof(values->sign.c_tilde)); + CONSTTIME_DECLASSIFY(&values->sign.z, sizeof(values->sign.z)); + CONSTTIME_DECLASSIFY(&values->sign.h, sizeof(values->sign.h)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_signature, DILITHIUM_SIGNATURE_BYTES); + if (!dilithium_marshal_signature(&cbb, &values->sign)) { + goto err; + } + + BSSL_CHECK(CBB_len(&cbb) == DILITHIUM_SIGNATURE_BYTES); + ret = 1; + break; + } + +err: + OPENSSL_free(values); + return ret; +} + +// Dilithium signature in deterministic mode. Returns 1 on success and 0 on +// failure. +int DILITHIUM_sign_deterministic( + uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, const uint8_t *msg, + size_t msg_len) { + uint8_t randomizer[DILITHIUM_SIGNATURE_RANDOMIZER_BYTES]; + OPENSSL_memset(randomizer, 0, sizeof(randomizer)); + return dilithium_sign_with_randomizer(out_encoded_signature, private_key, msg, + msg_len, randomizer); +} + +// Dilithium signature in randomized mode, filling the random bytes with +// |RAND_bytes|. Returns 1 on success and 0 on failure. +int DILITHIUM_sign(uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, + const uint8_t *msg, size_t msg_len) { + uint8_t randomizer[DILITHIUM_SIGNATURE_RANDOMIZER_BYTES]; + RAND_bytes(randomizer, sizeof(randomizer)); + return dilithium_sign_with_randomizer(out_encoded_signature, private_key, msg, + msg_len, randomizer); +} + +// FIPS 204, Algorithm 3 (`ML-DSA.Verify`). +int DILITHIUM_verify(const struct DILITHIUM_public_key *public_key, + const uint8_t encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. 
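+  // Verification recomputes w1' = UseHint(h, A*z - c*t1*2^d) from the public
+  // key and the signature, then checks that SHAKE-256(mu || w1Encode(w1'))
+  // matches c_tilde and that the bounds on z and on the number of hint bits
+  // hold.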
+ struct values_st { + struct signature sign; + matrix a_ntt; + vectorl z_ntt; + vectork az_ntt; + vectork t1_ntt; + vectork ct1_ntt; + vectork w_approx; + vectork w1; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + const struct public_key *pub = public_key_from_external(public_key); + + CBS cbs; + CBS_init(&cbs, encoded_signature, DILITHIUM_SIGNATURE_BYTES); + if (!dilithium_parse_signature(&values->sign, &cbs)) { + goto err; + } + + matrix_expand(&values->a_ntt, pub->rho); + + uint8_t mu[MU_BYTES]; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, pub->public_key_hash, + sizeof(pub->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, MU_BYTES); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, 32); + scalar_ntt(&c_ntt); + + OPENSSL_memcpy(&values->z_ntt, &values->sign.z, sizeof(values->z_ntt)); + vectorl_ntt(&values->z_ntt); + + matrix_mult(&values->az_ntt, &values->a_ntt, &values->z_ntt); + + vectork_scale_power2_round(&values->t1_ntt, &pub->t1); + vectork_ntt(&values->t1_ntt); + + vectork_mult_scalar(&values->ct1_ntt, &values->t1_ntt, &c_ntt); + + vectork_sub(&values->w_approx, &values->az_ntt, &values->ct1_ntt); + vectork_inverse_ntt(&values->w_approx); + + vectork_use_hint_vartime(&values->w1, &values->sign.h, &values->w_approx); + uint8_t w1_encoded[128 * K]; + w1_encode(w1_encoded, &values->w1); + + uint8_t c_tilde[2 * LAMBDA_BYTES]; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K); + BORINGSSL_keccak_squeeze(&keccak_ctx, c_tilde, 2 * LAMBDA_BYTES); + + uint32_t z_max = vectorl_max(&values->sign.z); + size_t h_ones = vectork_count_ones(&values->sign.h); + if (z_max < kGamma1 - BETA && h_ones <= OMEGA && + OPENSSL_memcmp(c_tilde, values->sign.c_tilde, 2 * LAMBDA_BYTES) == 0) { + ret = 1; + } + +err: + OPENSSL_free(values); + return ret; +} + +/* Serialization of keys. */ + +int DILITHIUM_marshal_public_key( + CBB *out, const struct DILITHIUM_public_key *public_key) { + return dilithium_marshal_public_key(out, + public_key_from_external(public_key)); +} + +int DILITHIUM_parse_public_key(struct DILITHIUM_public_key *public_key, + CBS *in) { + struct public_key *pub = public_key_from_external(public_key); + CBS orig_in = *in; + if (!dilithium_parse_public_key(pub, in) || CBS_len(in) != 0) { + return 0; + } + + // Compute pre-cached values. 
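+  // The pre-cached value is |public_key_hash| (tr in FIPS 204): a 64-byte
+  // SHAKE-256 digest of the encoded public key. Both signing and verification
+  // absorb it ahead of the message when deriving mu, which binds signatures
+  // to this exact public key encoding.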
+ BORINGSSL_keccak(pub->public_key_hash, sizeof(pub->public_key_hash), + CBS_data(&orig_in), CBS_len(&orig_in), boringssl_shake256); + return 1; +} + +int DILITHIUM_marshal_private_key( + CBB *out, const struct DILITHIUM_private_key *private_key) { + return dilithium_marshal_private_key(out, + private_key_from_external(private_key)); +} + +int DILITHIUM_parse_private_key(struct DILITHIUM_private_key *private_key, + CBS *in) { + struct private_key *priv = private_key_from_external(private_key); + return dilithium_parse_private_key(priv, in) && CBS_len(in) == 0; +} diff --git a/Sources/CCryptoBoringSSL/crypto/dilithium/internal.h b/Sources/CCryptoBoringSSL/crypto/dilithium/internal.h new file mode 100644 index 00000000..a8a7b3d1 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/dilithium/internal.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2023, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_DILITHIUM_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_DILITHIUM_INTERNAL_H + +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + + +// DILITHIUM_GENERATE_KEY_ENTROPY is the number of bytes of uniformly random +// entropy necessary to generate a key pair. +#define DILITHIUM_GENERATE_KEY_ENTROPY 32 + +// DILITHIUM_SIGNATURE_RANDOMIZER_BYTES is the number of bytes of uniformly +// random entropy necessary to generate a signature in randomized mode. +#define DILITHIUM_SIGNATURE_RANDOMIZER_BYTES 32 + +// DILITHIUM_generate_key_external_entropy generates a public/private key pair +// using the given seed, writes the encoded public key to +// |out_encoded_public_key| and sets |out_private_key| to the private key, +// returning 1 on success and 0 on failure. Returns 1 on success and 0 on +// failure. +OPENSSL_EXPORT int DILITHIUM_generate_key_external_entropy( + uint8_t out_encoded_public_key[DILITHIUM_PUBLIC_KEY_BYTES], + struct DILITHIUM_private_key *out_private_key, + const uint8_t entropy[DILITHIUM_GENERATE_KEY_ENTROPY]); + +// DILITHIUM_sign_deterministic generates a signature for the message |msg| of +// length |msg_len| using |private_key| following the deterministic algorithm, +// and writes the encoded signature to |out_encoded_signature|. Returns 1 on +// success and 0 on failure. 
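+//
+// As a sketch of the expected round trip (error handling elided, and assuming
+// a message |msg| of length |msg_len|):
+//
+//   uint8_t pub[DILITHIUM_PUBLIC_KEY_BYTES];
+//   uint8_t sig[DILITHIUM_SIGNATURE_BYTES];
+//   struct DILITHIUM_private_key priv;
+//   struct DILITHIUM_public_key parsed;
+//   CBS cbs;
+//
+//   DILITHIUM_generate_key(pub, &priv);
+//   DILITHIUM_sign_deterministic(sig, &priv, msg, msg_len);
+//   CBS_init(&cbs, pub, sizeof(pub));
+//   DILITHIUM_parse_public_key(&parsed, &cbs);
+//   int ok = DILITHIUM_verify(&parsed, sig, msg, msg_len);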
+OPENSSL_EXPORT int DILITHIUM_sign_deterministic( + uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, const uint8_t *msg, + size_t msg_len); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_CRYPTO_DILITHIUM_INTERNAL_H diff --git a/Sources/CCryptoBoringSSL/crypto/dsa/dsa.c b/Sources/CCryptoBoringSSL/crypto/dsa/dsa.c index 269fe6fb..40be40e7 100644 --- a/Sources/CCryptoBoringSSL/crypto/dsa/dsa.c +++ b/Sources/CCryptoBoringSSL/crypto/dsa/dsa.c @@ -208,6 +208,11 @@ int DSA_set0_pqg(DSA *dsa, BIGNUM *p, BIGNUM *q, BIGNUM *g) { int DSA_generate_parameters_ex(DSA *dsa, unsigned bits, const uint8_t *seed_in, size_t seed_len, int *out_counter, unsigned long *out_h, BN_GENCB *cb) { + if (bits > OPENSSL_DSA_MAX_MODULUS_BITS) { + OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); + return 0; + } + int ok = 0; unsigned char seed[SHA256_DIGEST_LENGTH]; unsigned char md[SHA256_DIGEST_LENGTH]; @@ -274,6 +279,8 @@ int DSA_generate_parameters_ex(DSA *dsa, unsigned bits, const uint8_t *seed_in, if (!RAND_bytes(seed, qsize)) { goto err; } + // DSA parameters are public. + CONSTTIME_DECLASSIFY(seed, qsize); } else { // If we come back through, use random seed next time. seed_in = NULL; @@ -477,11 +484,13 @@ DSA *DSAparams_dup(const DSA *dsa) { } int DSA_generate_key(DSA *dsa) { + if (!dsa_check_key(dsa)) { + return 0; + } + int ok = 0; - BN_CTX *ctx = NULL; BIGNUM *pub_key = NULL, *priv_key = NULL; - - ctx = BN_CTX_new(); + BN_CTX *ctx = BN_CTX_new(); if (ctx == NULL) { goto err; } @@ -513,6 +522,9 @@ int DSA_generate_key(DSA *dsa) { goto err; } + // The public key is computed from the private key, but is public. + bn_declassify(pub_key); + dsa->priv_key = priv_key; dsa->pub_key = pub_key; ok = 1; @@ -649,6 +661,10 @@ DSA_SIG *DSA_do_sign(const uint8_t *digest, size_t digest_len, const DSA *dsa) { goto err; } + // The signature is computed from the private key, but is public. + bn_declassify(r); + bn_declassify(s); + // Redo if r or s is zero as required by FIPS 186-3: this is // very unlikely. if (BN_is_zero(r) || BN_is_zero(s)) { @@ -681,7 +697,7 @@ DSA_SIG *DSA_do_sign(const uint8_t *digest, size_t digest_len, const DSA *dsa) { return ret; } -int DSA_do_verify(const uint8_t *digest, size_t digest_len, DSA_SIG *sig, +int DSA_do_verify(const uint8_t *digest, size_t digest_len, const DSA_SIG *sig, const DSA *dsa) { int valid; if (!DSA_do_check_signature(&valid, digest, digest_len, sig, dsa)) { @@ -691,7 +707,8 @@ int DSA_do_verify(const uint8_t *digest, size_t digest_len, DSA_SIG *sig, } int DSA_do_check_signature(int *out_valid, const uint8_t *digest, - size_t digest_len, DSA_SIG *sig, const DSA *dsa) { + size_t digest_len, const DSA_SIG *sig, + const DSA *dsa) { *out_valid = 0; if (!dsa_check_key(dsa)) { return 0; @@ -899,15 +916,19 @@ static int dsa_sign_setup(const DSA *dsa, BN_CTX *ctx, BIGNUM **out_kinv, ctx) || // Compute r = (g^k mod p) mod q !BN_mod_exp_mont_consttime(r, dsa->g, &k, dsa->p, ctx, - dsa->method_mont_p) || - // Note |BN_mod| below is not constant-time and may leak information about - // |r|. |dsa->p| may be significantly larger than |dsa->q|, so this is not - // easily performed in constant-time with Montgomery reduction. - // - // However, |r| at this point is g^k (mod p). It is almost the value of - // |r| revealed in the signature anyway (g^k (mod p) (mod q)), going from - // it to |k| would require computing a discrete log. 
- !BN_mod(r, r, dsa->q, ctx) || + dsa->method_mont_p)) { + OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); + goto err; + } + // Note |BN_mod| below is not constant-time and may leak information about + // |r|. |dsa->p| may be significantly larger than |dsa->q|, so this is not + // easily performed in constant-time with Montgomery reduction. + // + // However, |r| at this point is g^k (mod p). It is almost the value of |r| + // revealed in the signature anyway (g^k (mod p) (mod q)), going from it to + // |k| would require computing a discrete log. + bn_declassify(r); + if (!BN_mod(r, r, dsa->q, ctx) || // Compute part of 's = inv(k) (m + xr) mod q' using Fermat's Little // Theorem. !bn_mod_inverse_prime(kinv, &k, dsa->q, ctx, dsa->method_mont_q)) { diff --git a/Sources/CCryptoBoringSSL/crypto/dsa/dsa_asn1.c b/Sources/CCryptoBoringSSL/crypto/dsa/dsa_asn1.c index 9c6dfaed..010c3465 100644 --- a/Sources/CCryptoBoringSSL/crypto/dsa/dsa_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/dsa/dsa_asn1.c @@ -65,8 +65,6 @@ #include "../bytestring/internal.h" -#define OPENSSL_DSA_MAX_MODULUS_BITS 10000 - // This function is in dsa_asn1.c rather than dsa.c because it is reachable from // |EVP_PKEY| parsers. This makes it easier for the static linker to drop most // of the DSA implementation. @@ -119,8 +117,9 @@ int dsa_check_key(const DSA *dsa) { if (dsa->priv_key != NULL) { // The private key is a non-zero element of the scalar field, determined by // |q|. - if (BN_is_negative(dsa->priv_key) || BN_is_zero(dsa->priv_key) || - BN_cmp(dsa->priv_key, dsa->q) >= 0) { + if (BN_is_negative(dsa->priv_key) || + constant_time_declassify_int(BN_is_zero(dsa->priv_key)) || + constant_time_declassify_int(BN_cmp(dsa->priv_key, dsa->q) >= 0)) { OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); return 0; } diff --git a/Sources/CCryptoBoringSSL/crypto/dsa/internal.h b/Sources/CCryptoBoringSSL/crypto/dsa/internal.h index eb537d6f..44a40934 100644 --- a/Sources/CCryptoBoringSSL/crypto/dsa/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/dsa/internal.h @@ -42,6 +42,8 @@ struct dsa_st { CRYPTO_EX_DATA ex_data; }; +#define OPENSSL_DSA_MAX_MODULUS_BITS 10000 + // dsa_check_key performs cheap self-checks on |dsa|, and ensures it is within // DoS bounds. It returns one on success and zero on error. 
int dsa_check_key(const DSA *dsa); diff --git a/Sources/CCryptoBoringSSL/crypto/ecdsa_extra/ecdsa_asn1.c b/Sources/CCryptoBoringSSL/crypto/ecdsa_extra/ecdsa_asn1.c index 3650858e..11c9deb9 100644 --- a/Sources/CCryptoBoringSSL/crypto/ecdsa_extra/ecdsa_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/ecdsa_extra/ecdsa_asn1.c @@ -62,34 +62,87 @@ #include #include "../bytestring/internal.h" -#include "../fipsmodule/ec/internal.h" +#include "../fipsmodule/ecdsa/internal.h" #include "../internal.h" +static ECDSA_SIG *ecdsa_sig_from_fixed(const EC_KEY *key, const uint8_t *in, + size_t len) { + const EC_GROUP *group = EC_KEY_get0_group(key); + if (group == NULL) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); + return NULL; + } + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + if (len != 2 * scalar_len) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + return NULL; + } + ECDSA_SIG *ret = ECDSA_SIG_new(); + if (ret == NULL || + !BN_bin2bn(in, scalar_len, ret->r) || + !BN_bin2bn(in + scalar_len, scalar_len, ret->s)) { + ECDSA_SIG_free(ret); + return NULL; + } + return ret; +} + +static int ecdsa_sig_to_fixed(const EC_KEY *key, uint8_t *out, size_t *out_len, + size_t max_out, const ECDSA_SIG *sig) { + const EC_GROUP *group = EC_KEY_get0_group(key); + if (group == NULL) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + if (max_out < 2 * scalar_len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; + } + if (BN_is_negative(sig->r) || + !BN_bn2bin_padded(out, scalar_len, sig->r) || + BN_is_negative(sig->s) || + !BN_bn2bin_padded(out + scalar_len, scalar_len, sig->s)) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + return 0; + } + *out_len = 2 * scalar_len; + return 1; +} + int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, - unsigned int *sig_len, const EC_KEY *eckey) { + unsigned int *out_sig_len, const EC_KEY *eckey) { if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { - return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len, + return eckey->ecdsa_meth->sign(digest, digest_len, sig, out_sig_len, (EC_KEY*) eckey /* cast away const */); } - int ret = 0; - ECDSA_SIG *s = ECDSA_do_sign(digest, digest_len, eckey); + *out_sig_len = 0; + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + if (!ecdsa_sign_fixed(digest, digest_len, fixed, &fixed_len, sizeof(fixed), + eckey)) { + return 0; + } + + // TODO(davidben): We can actually do better and go straight from the DER + // format to the fixed-width format without a malloc. + ECDSA_SIG *s = ecdsa_sig_from_fixed(eckey, fixed, fixed_len); if (s == NULL) { - *sig_len = 0; - goto err; + return 0; } + int ret = 0; CBB cbb; CBB_init_fixed(&cbb, sig, ECDSA_size(eckey)); size_t len; if (!ECDSA_SIG_marshal(&cbb, s) || !CBB_finish(&cbb, NULL, &len)) { OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_ENCODE_ERROR); - *sig_len = 0; goto err; } - *sig_len = (unsigned)len; + *out_sig_len = (unsigned)len; ret = 1; err: @@ -99,12 +152,13 @@ int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len, const uint8_t *sig, size_t sig_len, const EC_KEY *eckey) { - ECDSA_SIG *s; + // Decode the ECDSA signature. + // + // TODO(davidben): We can actually do better and go straight from the DER + // format to the fixed-width format without a malloc. int ret = 0; uint8_t *der = NULL; - - // Decode the ECDSA signature. 
- s = ECDSA_SIG_from_bytes(sig, sig_len); + ECDSA_SIG *s = ECDSA_SIG_from_bytes(sig, sig_len); if (s == NULL) { goto err; } @@ -118,7 +172,10 @@ int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len, goto err; } - ret = ECDSA_do_verify(digest, digest_len, s, eckey); + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + ret = ecdsa_sig_to_fixed(eckey, fixed, &fixed_len, sizeof(fixed), s) && + ecdsa_verify_fixed(digest, digest_len, fixed, fixed_len, eckey); err: OPENSSL_free(der); @@ -147,6 +204,95 @@ size_t ECDSA_size(const EC_KEY *key) { return ECDSA_SIG_max_len(group_order_size); } +ECDSA_SIG *ECDSA_SIG_new(void) { + ECDSA_SIG *sig = OPENSSL_malloc(sizeof(ECDSA_SIG)); + if (sig == NULL) { + return NULL; + } + sig->r = BN_new(); + sig->s = BN_new(); + if (sig->r == NULL || sig->s == NULL) { + ECDSA_SIG_free(sig); + return NULL; + } + return sig; +} + +void ECDSA_SIG_free(ECDSA_SIG *sig) { + if (sig == NULL) { + return; + } + + BN_free(sig->r); + BN_free(sig->s); + OPENSSL_free(sig); +} + +const BIGNUM *ECDSA_SIG_get0_r(const ECDSA_SIG *sig) { + return sig->r; +} + +const BIGNUM *ECDSA_SIG_get0_s(const ECDSA_SIG *sig) { + return sig->s; +} + +void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **out_r, + const BIGNUM **out_s) { + if (out_r != NULL) { + *out_r = sig->r; + } + if (out_s != NULL) { + *out_s = sig->s; + } +} + +int ECDSA_SIG_set0(ECDSA_SIG *sig, BIGNUM *r, BIGNUM *s) { + if (r == NULL || s == NULL) { + return 0; + } + BN_free(sig->r); + BN_free(sig->s); + sig->r = r; + sig->s = s; + return 1; +} + +int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, + const ECDSA_SIG *sig, const EC_KEY *eckey) { + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + return ecdsa_sig_to_fixed(eckey, fixed, &fixed_len, sizeof(fixed), sig) && + ecdsa_verify_fixed(digest, digest_len, fixed, fixed_len, eckey); +} + +// This function is only exported for testing and is not called in production +// code. +ECDSA_SIG *ECDSA_sign_with_nonce_and_leak_private_key_for_testing( + const uint8_t *digest, size_t digest_len, const EC_KEY *eckey, + const uint8_t *nonce, size_t nonce_len) { + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed_with_nonce_for_known_answer_test( + digest, digest_len, sig, &sig_len, sizeof(sig), eckey, nonce, + nonce_len)) { + return NULL; + } + + return ecdsa_sig_from_fixed(eckey, sig, sig_len); +} + +ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, + const EC_KEY *eckey) { + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed(digest, digest_len, sig, &sig_len, sizeof(sig), + eckey)) { + return NULL; + } + + return ecdsa_sig_from_fixed(eckey, sig, sig_len); +} + ECDSA_SIG *ECDSA_SIG_parse(CBS *cbs) { ECDSA_SIG *ret = ECDSA_SIG_new(); if (ret == NULL) { diff --git a/Sources/CCryptoBoringSSL/crypto/err/err.c b/Sources/CCryptoBoringSSL/crypto/err/err.c index 5504c6a2..36a07eac 100644 --- a/Sources/CCryptoBoringSSL/crypto/err/err.c +++ b/Sources/CCryptoBoringSSL/crypto/err/err.c @@ -164,6 +164,17 @@ extern const uint32_t kOpenSSLReasonValues[]; extern const size_t kOpenSSLReasonValuesLen; extern const char kOpenSSLReasonStringData[]; +static char *strdup_libc_malloc(const char *str) { + // |strdup| is not in C until C23, so MSVC triggers deprecation warnings, and + // glibc and musl gate it on a feature macro. Reimplementing it is easier. 
+ size_t len = strlen(str); + char *ret = malloc(len + 1); + if (ret != NULL) { + memcpy(ret, str, len + 1); + } + return ret; +} + // err_clear clears the given queued error. static void err_clear(struct err_error_st *error) { free(error->data); @@ -174,13 +185,9 @@ static void err_copy(struct err_error_st *dst, const struct err_error_st *src) { err_clear(dst); dst->file = src->file; if (src->data != NULL) { - // Disable deprecated functions on msvc so it doesn't complain about strdup. - OPENSSL_MSVC_PRAGMA(warning(push)) - OPENSSL_MSVC_PRAGMA(warning(disable : 4996)) // We can't use OPENSSL_strdup because we don't want to call OPENSSL_malloc, // which can affect the error stack. - dst->data = strdup(src->data); - OPENSSL_MSVC_PRAGMA(warning(pop)) + dst->data = strdup_libc_malloc(src->data); } dst->packed = src->packed; dst->line = src->line; @@ -767,13 +774,9 @@ void ERR_set_error_data(char *data, int flags) { assert(0); return; } - // Disable deprecated functions on msvc so it doesn't complain about strdup. - OPENSSL_MSVC_PRAGMA(warning(push)) - OPENSSL_MSVC_PRAGMA(warning(disable : 4996)) // We can not use OPENSSL_strdup because we don't want to call OPENSSL_malloc, // which can affect the error stack. - char *copy = strdup(data); - OPENSSL_MSVC_PRAGMA(warning(pop)) + char *copy = strdup_libc_malloc(data); if (copy != NULL) { err_set_error_data(copy); } diff --git a/Sources/CCryptoBoringSSL/crypto/evp/evp.c b/Sources/CCryptoBoringSSL/crypto/evp/evp.c index aa31111a..db9fae49 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/evp.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/evp.c @@ -59,12 +59,9 @@ #include #include -#include -#include #include #include #include -#include #include #include "internal.h" @@ -149,9 +146,7 @@ int EVP_PKEY_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { int EVP_PKEY_copy_parameters(EVP_PKEY *to, const EVP_PKEY *from) { if (to->type == EVP_PKEY_NONE) { - if (!EVP_PKEY_set_type(to, from->type)) { - return 0; - } + evp_pkey_set_method(to, from->ameth); } else if (to->type != from->type) { OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES); return 0; @@ -225,117 +220,21 @@ static const EVP_PKEY_ASN1_METHOD *evp_pkey_asn1_find(int nid) { } } -static void evp_pkey_set_method(EVP_PKEY *pkey, - const EVP_PKEY_ASN1_METHOD *method) { +void evp_pkey_set_method(EVP_PKEY *pkey, const EVP_PKEY_ASN1_METHOD *method) { free_it(pkey); pkey->ameth = method; pkey->type = pkey->ameth->pkey_id; } int EVP_PKEY_type(int nid) { - const EVP_PKEY_ASN1_METHOD *meth = evp_pkey_asn1_find(nid); - if (meth == NULL) { - return NID_undef; - } - return meth->pkey_id; -} - -int EVP_PKEY_set1_RSA(EVP_PKEY *pkey, RSA *key) { - if (EVP_PKEY_assign_RSA(pkey, key)) { - RSA_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_RSA(EVP_PKEY *pkey, RSA *key) { - evp_pkey_set_method(pkey, &rsa_asn1_meth); - pkey->pkey = key; - return key != NULL; -} - -RSA *EVP_PKEY_get0_RSA(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_RSA) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_RSA_KEY); - return NULL; - } - return pkey->pkey; -} - -RSA *EVP_PKEY_get1_RSA(const EVP_PKEY *pkey) { - RSA *rsa = EVP_PKEY_get0_RSA(pkey); - if (rsa != NULL) { - RSA_up_ref(rsa); - } - return rsa; + // In OpenSSL, this was used to map between type aliases. BoringSSL supports + // no type aliases, so this function is just the identity. 
+ return nid; } -int EVP_PKEY_set1_DSA(EVP_PKEY *pkey, DSA *key) { - if (EVP_PKEY_assign_DSA(pkey, key)) { - DSA_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_DSA(EVP_PKEY *pkey, DSA *key) { - evp_pkey_set_method(pkey, &dsa_asn1_meth); - pkey->pkey = key; - return key != NULL; -} - -DSA *EVP_PKEY_get0_DSA(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_DSA) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DSA_KEY); - return NULL; - } - return pkey->pkey; -} - -DSA *EVP_PKEY_get1_DSA(const EVP_PKEY *pkey) { - DSA *dsa = EVP_PKEY_get0_DSA(pkey); - if (dsa != NULL) { - DSA_up_ref(dsa); - } - return dsa; -} - -int EVP_PKEY_set1_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { - if (EVP_PKEY_assign_EC_KEY(pkey, key)) { - EC_KEY_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { - evp_pkey_set_method(pkey, &ec_asn1_meth); - pkey->pkey = key; - return key != NULL; -} - -EC_KEY *EVP_PKEY_get0_EC_KEY(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_EC) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_EC_KEY_KEY); - return NULL; - } - return pkey->pkey; -} - -EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey) { - EC_KEY *ec_key = EVP_PKEY_get0_EC_KEY(pkey); - if (ec_key != NULL) { - EC_KEY_up_ref(ec_key); - } - return ec_key; -} - -DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey) { return NULL; } -DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey) { return NULL; } - int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key) { - // This function can only be used to assign RSA, DSA, and EC keys. Other key - // types have internal representations which are not exposed through the + // This function can only be used to assign RSA, DSA, EC, and DH keys. Other + // key types have internal representations which are not exposed through the // public API. switch (type) { case EVP_PKEY_RSA: @@ -344,6 +243,8 @@ int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key) { return EVP_PKEY_assign_DSA(pkey, key); case EVP_PKEY_EC: return EVP_PKEY_assign_EC_KEY(pkey, key); + case EVP_PKEY_DH: + return EVP_PKEY_assign_DH(pkey, key); } OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); @@ -375,16 +276,26 @@ int EVP_PKEY_set_type(EVP_PKEY *pkey, int type) { EVP_PKEY *EVP_PKEY_new_raw_private_key(int type, ENGINE *unused, const uint8_t *in, size_t len) { - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { - goto err; + // To avoid pulling in all key types, look for specifically the key types that + // support |set_priv_raw|. + const EVP_PKEY_ASN1_METHOD *method; + switch (type) { + case EVP_PKEY_X25519: + method = &x25519_asn1_meth; + break; + case EVP_PKEY_ED25519: + method = &ed25519_asn1_meth; + break; + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return 0; } - if (ret->ameth->set_priv_raw == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + EVP_PKEY *ret = EVP_PKEY_new(); + if (ret == NULL) { goto err; } + evp_pkey_set_method(ret, method); if (!ret->ameth->set_priv_raw(ret, in, len)) { goto err; @@ -399,16 +310,26 @@ EVP_PKEY *EVP_PKEY_new_raw_private_key(int type, ENGINE *unused, EVP_PKEY *EVP_PKEY_new_raw_public_key(int type, ENGINE *unused, const uint8_t *in, size_t len) { - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { - goto err; + // To avoid pulling in all key types, look for specifically the key types that + // support |set_pub_raw|. 
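A usage sketch, not part of the patch, for the narrowed |EVP_PKEY_new_raw_private_key| above: with only X25519 and Ed25519 reachable through the switch, a typical Ed25519 signer looks like the following. The function name is hypothetical; the 32-byte seed and 64-byte signature sizes are the usual Ed25519 sizes and are stated here as assumptions of the sketch.

    #include <openssl/evp.h>

    // Hypothetical caller: sign |msg| with a raw 32-byte Ed25519 seed.
    static int ed25519_sign_msg(const uint8_t seed[32], const uint8_t *msg,
                                size_t msg_len, uint8_t out_sig[64]) {
      int ok = 0;
      size_t sig_len = 64;
      EVP_MD_CTX ctx;
      EVP_MD_CTX_init(&ctx);
      EVP_PKEY *pkey =
          EVP_PKEY_new_raw_private_key(EVP_PKEY_ED25519, NULL, seed, 32);
      // Ed25519 signing is one-shot, so the digest argument is NULL.
      if (pkey != NULL &&
          EVP_DigestSignInit(&ctx, NULL, NULL, NULL, pkey) &&
          EVP_DigestSign(&ctx, out_sig, &sig_len, msg, msg_len)) {
        ok = 1;
      }
      EVP_MD_CTX_cleanup(&ctx);
      EVP_PKEY_free(pkey);
      return ok;
    }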
+ const EVP_PKEY_ASN1_METHOD *method; + switch (type) { + case EVP_PKEY_X25519: + method = &x25519_asn1_meth; + break; + case EVP_PKEY_ED25519: + method = &ed25519_asn1_meth; + break; + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return 0; } - if (ret->ameth->set_pub_raw == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + EVP_PKEY *ret = EVP_PKEY_new(); + if (ret == NULL) { goto err; } + evp_pkey_set_method(ret, method); if (!ret->ameth->set_pub_raw(ret, in, len)) { goto err; diff --git a/Sources/CCryptoBoringSSL/crypto/evp/evp_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/evp_asn1.c index e2ad27f3..49413be4 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/evp_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/evp_asn1.c @@ -69,6 +69,7 @@ #include "../internal.h" +// We intentionally omit |dh_asn1_meth| from this list. It is not serializable. static const EVP_PKEY_ASN1_METHOD *const kASN1Methods[] = { &rsa_asn1_meth, &ec_asn1_meth, @@ -77,28 +78,26 @@ static const EVP_PKEY_ASN1_METHOD *const kASN1Methods[] = { &x25519_asn1_meth, }; -static int parse_key_type(CBS *cbs, int *out_type) { +static const EVP_PKEY_ASN1_METHOD *parse_key_type(CBS *cbs) { CBS oid; if (!CBS_get_asn1(cbs, &oid, CBS_ASN1_OBJECT)) { - return 0; + return NULL; } for (unsigned i = 0; i < OPENSSL_ARRAY_SIZE(kASN1Methods); i++) { const EVP_PKEY_ASN1_METHOD *method = kASN1Methods[i]; if (CBS_len(&oid) == method->oid_len && OPENSSL_memcmp(CBS_data(&oid), method->oid, method->oid_len) == 0) { - *out_type = method->pkey_id; - return 1; + return method; } } - return 0; + return NULL; } EVP_PKEY *EVP_parse_public_key(CBS *cbs) { // Parse the SubjectPublicKeyInfo. CBS spki, algorithm, key; - int type; uint8_t padding; if (!CBS_get_asn1(cbs, &spki, CBS_ASN1_SEQUENCE) || !CBS_get_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || @@ -107,7 +106,8 @@ EVP_PKEY *EVP_parse_public_key(CBS *cbs) { OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); return NULL; } - if (!parse_key_type(&algorithm, &type)) { + const EVP_PKEY_ASN1_METHOD *method = parse_key_type(&algorithm); + if (method == NULL) { OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); return NULL; } @@ -121,10 +121,10 @@ EVP_PKEY *EVP_parse_public_key(CBS *cbs) { // Set up an |EVP_PKEY| of the appropriate type. EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { + if (ret == NULL) { goto err; } + evp_pkey_set_method(ret, method); // Call into the type-specific SPKI decoding function. if (ret->ameth->pub_decode == NULL) { @@ -155,7 +155,6 @@ EVP_PKEY *EVP_parse_private_key(CBS *cbs) { // Parse the PrivateKeyInfo. CBS pkcs8, algorithm, key; uint64_t version; - int type; if (!CBS_get_asn1(cbs, &pkcs8, CBS_ASN1_SEQUENCE) || !CBS_get_asn1_uint64(&pkcs8, &version) || version != 0 || @@ -164,7 +163,8 @@ EVP_PKEY *EVP_parse_private_key(CBS *cbs) { OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); return NULL; } - if (!parse_key_type(&algorithm, &type)) { + const EVP_PKEY_ASN1_METHOD *method = parse_key_type(&algorithm); + if (method == NULL) { OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); return NULL; } @@ -173,10 +173,10 @@ EVP_PKEY *EVP_parse_private_key(CBS *cbs) { // Set up an |EVP_PKEY| of the appropriate type. EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { + if (ret == NULL) { goto err; } + evp_pkey_set_method(ret, method); // Call into the type-specific PrivateKeyInfo decoding function. 
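The parser refactor above changes how the ASN.1 method table is located, not the caller-visible contract of |EVP_parse_public_key| / |EVP_parse_private_key|. For reference, a minimal caller sketch, not part of the patch; the helper name and the trailing-data check are illustrative.

    #include <openssl/bytestring.h>
    #include <openssl/evp.h>

    // Hypothetical helper: parse a DER SubjectPublicKeyInfo and reject
    // any trailing bytes after the structure.
    static EVP_PKEY *parse_spki(const uint8_t *der, size_t der_len) {
      CBS cbs;
      CBS_init(&cbs, der, der_len);
      EVP_PKEY *pkey = EVP_parse_public_key(&cbs);
      if (pkey != NULL && CBS_len(&cbs) != 0) {
        EVP_PKEY_free(pkey);
        return NULL;
      }
      return pkey;
    }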
if (ret->ameth->priv_decode == NULL) { diff --git a/Sources/CCryptoBoringSSL/crypto/evp/internal.h b/Sources/CCryptoBoringSSL/crypto/evp/internal.h index d8e0f119..3a9785b4 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/evp/internal.h @@ -213,6 +213,7 @@ OPENSSL_EXPORT int EVP_PKEY_CTX_ctrl(EVP_PKEY_CTX *ctx, int keytype, int optype, #define EVP_PKEY_CTRL_HKDF_KEY (EVP_PKEY_ALG_CTRL + 16) #define EVP_PKEY_CTRL_HKDF_SALT (EVP_PKEY_ALG_CTRL + 17) #define EVP_PKEY_CTRL_HKDF_INFO (EVP_PKEY_ALG_CTRL + 18) +#define EVP_PKEY_CTRL_DH_PAD (EVP_PKEY_ALG_CTRL + 19) struct evp_pkey_ctx_st { // Method associated with this operation @@ -288,12 +289,18 @@ extern const EVP_PKEY_ASN1_METHOD ec_asn1_meth; extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meth; extern const EVP_PKEY_ASN1_METHOD ed25519_asn1_meth; extern const EVP_PKEY_ASN1_METHOD x25519_asn1_meth; +extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; extern const EVP_PKEY_METHOD rsa_pkey_meth; extern const EVP_PKEY_METHOD ec_pkey_meth; extern const EVP_PKEY_METHOD ed25519_pkey_meth; extern const EVP_PKEY_METHOD x25519_pkey_meth; extern const EVP_PKEY_METHOD hkdf_pkey_meth; +extern const EVP_PKEY_METHOD dh_pkey_meth; + +// evp_pkey_set_method behaves like |EVP_PKEY_set_type|, but takes a pointer to +// a method table. This avoids depending on every |EVP_PKEY_ASN1_METHOD|. +void evp_pkey_set_method(EVP_PKEY *pkey, const EVP_PKEY_ASN1_METHOD *method); #if defined(__cplusplus) diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_dh.c b/Sources/CCryptoBoringSSL/crypto/evp/p_dh.c new file mode 100644 index 00000000..5e901020 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_dh.c @@ -0,0 +1,137 @@ +/* + * Copyright 2006-2019 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include + +#include + +#include +#include +#include + +#include "internal.h" + + +typedef struct dh_pkey_ctx_st { + int pad; +} DH_PKEY_CTX; + +static int pkey_dh_init(EVP_PKEY_CTX *ctx) { + DH_PKEY_CTX *dctx = OPENSSL_zalloc(sizeof(DH_PKEY_CTX)); + if (dctx == NULL) { + return 0; + } + + ctx->data = dctx; + return 1; +} + +static int pkey_dh_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { + if (!pkey_dh_init(dst)) { + return 0; + } + + const DH_PKEY_CTX *sctx = src->data; + DH_PKEY_CTX *dctx = dst->data; + dctx->pad = sctx->pad; + return 1; +} + +static void pkey_dh_cleanup(EVP_PKEY_CTX *ctx) { + OPENSSL_free(ctx->data); + ctx->data = NULL; +} + +static int pkey_dh_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { + DH *dh = DH_new(); + if (dh == NULL || !EVP_PKEY_assign_DH(pkey, dh)) { + DH_free(dh); + return 0; + } + + if (ctx->pkey != NULL && !EVP_PKEY_copy_parameters(pkey, ctx->pkey)) { + return 0; + } + + return DH_generate_key(dh); +} + +static int pkey_dh_derive(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *out_len) { + DH_PKEY_CTX *dctx = ctx->data; + if (ctx->pkey == NULL || ctx->peerkey == NULL) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + DH *our_key = ctx->pkey->pkey; + DH *peer_key = ctx->peerkey->pkey; + if (our_key == NULL || peer_key == NULL) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + const BIGNUM *pub_key = DH_get0_pub_key(peer_key); + if (pub_key == NULL) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + if (out == NULL) { + *out_len = DH_size(our_key); + return 1; + } + + if (*out_len < (size_t)DH_size(our_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + int ret = dctx->pad ? DH_compute_key_padded(out, pub_key, our_key) + : DH_compute_key(out, pub_key, our_key); + if (ret < 0) { + return 0; + } + + assert(ret <= DH_size(our_key)); + *out_len = (size_t)ret; + return 1; +} + +static int pkey_dh_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) { + DH_PKEY_CTX *dctx = ctx->data; + switch (type) { + case EVP_PKEY_CTRL_PEER_KEY: + // |EVP_PKEY_derive_set_peer| requires the key implement this command, + // even if it is a no-op. + return 1; + + case EVP_PKEY_CTRL_DH_PAD: + dctx->pad = p1; + return 1; + + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } +} + +const EVP_PKEY_METHOD dh_pkey_meth = { + .pkey_id = EVP_PKEY_DH, + .init = pkey_dh_init, + .copy = pkey_dh_copy, + .cleanup = pkey_dh_cleanup, + .keygen = pkey_dh_keygen, + .derive = pkey_dh_derive, + .ctrl = pkey_dh_ctrl, +}; + +int EVP_PKEY_CTX_set_dh_pad(EVP_PKEY_CTX *ctx, int pad) { + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DH, EVP_PKEY_OP_DERIVE, + EVP_PKEY_CTRL_DH_PAD, pad, NULL); +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_dh_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/p_dh_asn1.c new file mode 100644 index 00000000..7132552e --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_dh_asn1.c @@ -0,0 +1,120 @@ +/* + * Copyright 2006-2021 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
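A hedged end-to-end sketch, not part of the patch, of the new DH |EVP_PKEY_METHOD| added in p_dh.c above: derive a shared secret between two DH |EVP_PKEY|s, querying the output size with a NULL buffer first, which |pkey_dh_derive| explicitly supports. The helper name is an assumption. Note the ownership split in the accessors added just below: |EVP_PKEY_assign_DH| takes ownership of the |DH|, while |EVP_PKEY_set1_DH| adds a reference.

    #include <openssl/dh.h>
    #include <openssl/evp.h>

    // Hypothetical helper: returns the shared secret length, or 0 on error.
    static size_t dh_derive_shared(EVP_PKEY *our_key, EVP_PKEY *peer_key,
                                   uint8_t *out, size_t max_out) {
      size_t len = 0;
      EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(our_key, NULL);
      if (ctx == NULL ||
          !EVP_PKEY_derive_init(ctx) ||
          // Pad the secret to DH_size(), via EVP_PKEY_CTRL_DH_PAD above.
          !EVP_PKEY_CTX_set_dh_pad(ctx, 1) ||
          !EVP_PKEY_derive_set_peer(ctx, peer_key) ||
          !EVP_PKEY_derive(ctx, NULL, &len) ||  // query the maximum size
          len > max_out ||
          !EVP_PKEY_derive(ctx, out, &len)) {
        len = 0;
      }
      EVP_PKEY_CTX_free(ctx);
      return len;
    }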
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include + +#include +#include +#include + +#include "internal.h" +#include "../internal.h" + + +static void dh_free(EVP_PKEY *pkey) { + DH_free(pkey->pkey); + pkey->pkey = NULL; +} + +static int dh_size(const EVP_PKEY *pkey) { return DH_size(pkey->pkey); } + +static int dh_bits(const EVP_PKEY *pkey) { return DH_bits(pkey->pkey); } + +static int dh_param_missing(const EVP_PKEY *pkey) { + const DH *dh = pkey->pkey; + return dh == NULL || DH_get0_p(dh) == NULL || DH_get0_g(dh) == NULL; +} + +static int dh_param_copy(EVP_PKEY *to, const EVP_PKEY *from) { + if (dh_param_missing(from)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); + return 0; + } + + const DH *dh = from->pkey; + const BIGNUM *q_old = DH_get0_q(dh); + BIGNUM *p = BN_dup(DH_get0_p(dh)); + BIGNUM *q = q_old == NULL ? NULL : BN_dup(q_old); + BIGNUM *g = BN_dup(DH_get0_g(dh)); + if (p == NULL || (q_old != NULL && q == NULL) || g == NULL || + !DH_set0_pqg(to->pkey, p, q, g)) { + BN_free(p); + BN_free(q); + BN_free(g); + return 0; + } + + // |DH_set0_pqg| took ownership of |p|, |q|, and |g|. + return 1; +} + +static int dh_param_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { + if (dh_param_missing(a) || dh_param_missing(b)) { + return -2; + } + + // Matching OpenSSL, only compare p and g for PKCS#3-style Diffie-Hellman. + // OpenSSL only checks q in X9.42-style Diffie-Hellman ("DHX"). + const DH *a_dh = a->pkey; + const DH *b_dh = b->pkey; + return BN_cmp(DH_get0_p(a_dh), DH_get0_p(b_dh)) == 0 && + BN_cmp(DH_get0_g(a_dh), DH_get0_g(b_dh)) == 0; +} + +static int dh_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { + if (dh_param_cmp(a, b) <= 0) { + return 0; + } + + const DH *a_dh = a->pkey; + const DH *b_dh = b->pkey; + return BN_cmp(DH_get0_pub_key(a_dh), DH_get0_pub_key(b_dh)) == 0; +} + +const EVP_PKEY_ASN1_METHOD dh_asn1_meth = { + .pkey_id = EVP_PKEY_DH, + .pkey_method = &dh_pkey_meth, + .pub_cmp = dh_pub_cmp, + .pkey_size = dh_size, + .pkey_bits = dh_bits, + .param_missing = dh_param_missing, + .param_copy = dh_param_copy, + .param_cmp = dh_param_cmp, + .pkey_free = dh_free, +}; + +int EVP_PKEY_set1_DH(EVP_PKEY *pkey, DH *key) { + if (EVP_PKEY_assign_DH(pkey, key)) { + DH_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_DH(EVP_PKEY *pkey, DH *key) { + evp_pkey_set_method(pkey, &dh_asn1_meth); + pkey->pkey = key; + return key != NULL; +} + +DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey) { + if (pkey->type != EVP_PKEY_DH) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DH_KEY); + return NULL; + } + return pkey->pkey; +} + +DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey) { + DH *dh = EVP_PKEY_get0_DH(pkey); + if (dh != NULL) { + DH_up_ref(dh); + } + return dh; +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_dsa_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/p_dsa_asn1.c index 02dc36ad..e7d13bba 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_dsa_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_dsa_asn1.c @@ -306,3 +306,33 @@ int EVP_PKEY_CTX_set_dsa_paramgen_q_bits(EVP_PKEY_CTX *ctx, int qbits) { OPENSSL_PUT_ERROR(EVP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); return 0; } + +int EVP_PKEY_set1_DSA(EVP_PKEY *pkey, DSA *key) { + if (EVP_PKEY_assign_DSA(pkey, key)) { + DSA_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_DSA(EVP_PKEY *pkey, DSA *key) { + evp_pkey_set_method(pkey, &dsa_asn1_meth); + pkey->pkey = key; + return key != NULL; +} + +DSA *EVP_PKEY_get0_DSA(const 
EVP_PKEY *pkey) { + if (pkey->type != EVP_PKEY_DSA) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DSA_KEY); + return NULL; + } + return pkey->pkey; +} + +DSA *EVP_PKEY_get1_DSA(const EVP_PKEY *pkey) { + DSA *dsa = EVP_PKEY_get0_DSA(pkey); + if (dsa != NULL) { + DSA_up_ref(dsa); + } + return dsa; +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_ec.c b/Sources/CCryptoBoringSSL/crypto/evp/p_ec.c index cd523a9e..3cebd638 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_ec.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_ec.c @@ -90,15 +90,14 @@ static int pkey_ec_init(EVP_PKEY_CTX *ctx) { } static int pkey_ec_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { - EC_PKEY_CTX *dctx, *sctx; if (!pkey_ec_init(dst)) { return 0; } - sctx = src->data; - dctx = dst->data; + const EC_PKEY_CTX *sctx = src->data; + EC_PKEY_CTX *dctx = dst->data; dctx->md = sctx->md; - + dctx->gen_group = sctx->gen_group; return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_ec_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/p_ec_asn1.c index 2724e590..2682cf8d 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_ec_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_ec_asn1.c @@ -300,3 +300,33 @@ const EVP_PKEY_ASN1_METHOD ec_asn1_meth = { int_ec_free, }; + +int EVP_PKEY_set1_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { + if (EVP_PKEY_assign_EC_KEY(pkey, key)) { + EC_KEY_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { + evp_pkey_set_method(pkey, &ec_asn1_meth); + pkey->pkey = key; + return key != NULL; +} + +EC_KEY *EVP_PKEY_get0_EC_KEY(const EVP_PKEY *pkey) { + if (pkey->type != EVP_PKEY_EC) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_EC_KEY_KEY); + return NULL; + } + return pkey->pkey; +} + +EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey) { + EC_KEY *ec_key = EVP_PKEY_get0_EC_KEY(pkey); + if (ec_key != NULL) { + EC_KEY_up_ref(ec_key); + } + return ec_key; +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_ed25519.c b/Sources/CCryptoBoringSSL/crypto/evp/p_ed25519.c index ffae3ca6..6aabdd1b 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_ed25519.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_ed25519.c @@ -30,10 +30,7 @@ static int pkey_ed25519_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { return 0; } - if (!EVP_PKEY_set_type(pkey, EVP_PKEY_ED25519)) { - OPENSSL_free(key); - return 0; - } + evp_pkey_set_method(pkey, &ed25519_asn1_meth); uint8_t pubkey_unused[32]; ED25519_keypair(pubkey_unused, key->key); diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_rsa_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/p_rsa_asn1.c index 72843ee5..fcba3e9f 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_rsa_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_rsa_asn1.c @@ -209,3 +209,33 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meth = { int_rsa_free, }; + +int EVP_PKEY_set1_RSA(EVP_PKEY *pkey, RSA *key) { + if (EVP_PKEY_assign_RSA(pkey, key)) { + RSA_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_RSA(EVP_PKEY *pkey, RSA *key) { + evp_pkey_set_method(pkey, &rsa_asn1_meth); + pkey->pkey = key; + return key != NULL; +} + +RSA *EVP_PKEY_get0_RSA(const EVP_PKEY *pkey) { + if (pkey->type != EVP_PKEY_RSA) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_RSA_KEY); + return NULL; + } + return pkey->pkey; +} + +RSA *EVP_PKEY_get1_RSA(const EVP_PKEY *pkey) { + RSA *rsa = EVP_PKEY_get0_RSA(pkey); + if (rsa != NULL) { + RSA_up_ref(rsa); + } + return rsa; +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_x25519.c b/Sources/CCryptoBoringSSL/crypto/evp/p_x25519.c index 
4e1147b3..7762d9e6 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_x25519.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_x25519.c @@ -30,10 +30,7 @@ static int pkey_x25519_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { return 0; } - if (!EVP_PKEY_set_type(pkey, EVP_PKEY_X25519)) { - OPENSSL_free(key); - return 0; - } + evp_pkey_set_method(pkey, &x25519_asn1_meth); X25519_keypair(key->pub, key->priv); key->has_private = 1; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c.inc similarity index 85% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c.inc index ade87963..0b978d4d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c.inc @@ -104,3 +104,24 @@ int AES_set_decrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey) { return aes_nohw_set_decrypt_key(key, bits, aeskey); } } + +#if defined(HWAES) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) +// On x86 and x86_64, |aes_hw_set_decrypt_key|, we implement +// |aes_hw_encrypt_key_to_decrypt_key| in assembly and rely on C code to combine +// the operations. +int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) { + int ret = aes_hw_set_encrypt_key(user_key, bits, key); + if (ret == 0) { + aes_hw_encrypt_key_to_decrypt_key(key); + } + return ret; +} + +int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) { + if (aes_hw_set_encrypt_key_alt_preferred()) { + return aes_hw_set_encrypt_key_alt(user_key, bits, key); + } else { + return aes_hw_set_encrypt_key_base(user_key, bits, key); + } +} +#endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes_nohw.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes_nohw.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes_nohw.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes_nohw.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/internal.h index 98b2a14d..b4990957 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/internal.h @@ -17,6 +17,8 @@ #include +#include + #include "../../internal.h" #if defined(__cplusplus) @@ -66,17 +68,41 @@ OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); } #if defined(HWAES) -int aes_hw_set_encrypt_key(const uint8_t *user_key, const int bits, - AES_KEY *key); -int aes_hw_set_decrypt_key(const uint8_t *user_key, const int bits, - AES_KEY *key); +int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key); +int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key); void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); void aes_hw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, uint8_t *ivec, const int enc); + const AES_KEY *key, uint8_t *ivec, int enc); void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, const AES_KEY *key, const uint8_t ivec[16]); +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +// On x86 and x86_64, |aes_hw_set_decrypt_key| is implemented in terms of +// |aes_hw_set_encrypt_key| and a conversion function. 
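The internal |aes_hw_*| hooks discussed above sit behind the public AES_* key-schedule and block-cipher entry points touched in aes.c.inc. A caller-side sketch, not part of the patch; the helper name and the single-block round-trip framing are illustrative only.

    #include <string.h>
    #include <openssl/aes.h>

    // Hypothetical helper: one-block AES-256 encrypt/decrypt round trip.
    // AES_set_encrypt_key/AES_set_decrypt_key return 0 on success.
    static int aes_roundtrip_one_block(const uint8_t key[32],
                                       const uint8_t in[16], uint8_t out[16]) {
      AES_KEY enc_key, dec_key;
      uint8_t tmp[16];
      if (AES_set_encrypt_key(key, 256, &enc_key) != 0 ||
          AES_set_decrypt_key(key, 256, &dec_key) != 0) {
        return 0;
      }
      AES_encrypt(in, tmp, &enc_key);  // picks a hwaes/vpaes/nohw backend
      AES_decrypt(tmp, out, &dec_key);
      return memcmp(in, out, 16) == 0;
    }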
+void aes_hw_encrypt_key_to_decrypt_key(AES_KEY *key); + +// There are two variants of this function, one which uses aeskeygenassist +// ("base") and one which uses aesenclast + pshufb ("alt"). aesenclast is +// overall faster but is slower on some older processors. It doesn't use AVX, +// but AVX is used as a proxy to detecting this. See +// https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ +// +// TODO(davidben): It is unclear if the aeskeygenassist version is still +// worthwhile. However, the aesenclast version requires SSSE3. SSSE3 long +// predates AES-NI, but it's not clear if AES-NI implies SSSE3. In OpenSSL, the +// CCM AES-NI assembly seems to assume it does. +OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_capable(void) { + return hwaes_capable() && CRYPTO_is_SSSE3_capable(); +} +OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_preferred(void) { + return hwaes_capable() && CRYPTO_is_AVX_capable(); +} +int aes_hw_set_encrypt_key_base(const uint8_t *user_key, int bits, + AES_KEY *key); +int aes_hw_set_encrypt_key_alt(const uint8_t *user_key, int bits, AES_KEY *key); +#endif // OPENSSL_X86 || OPENSSL_X86_64 + #else // If HWAES isn't defined then we provide dummy functions for each of the hwaes @@ -120,7 +146,7 @@ OPENSSL_INLINE void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, #if defined(HWAES_ECB) void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, const int enc); + const AES_KEY *key, int enc); #endif // HWAES_ECB @@ -218,7 +244,7 @@ void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks, const AES_KEY *key, const uint8_t ivec[16]); void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t *ivec, const int enc); + const AES_KEY *key, uint8_t *ivec, int enc); #if defined(__cplusplus) diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/key_wrap.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/key_wrap.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/key_wrap.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/key_wrap.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/mode_wrappers.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/mode_wrappers.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/mode_wrappers.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/mode_wrappers.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-windows.windows.x86.S deleted file mode 100644 index cf3ab235..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-windows.windows.x86.S +++ /dev/null @@ -1,2473 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -%ifdef BORINGSSL_DISPATCH_TEST -extern _BORINGSSL_function_hit -%endif -global _aes_hw_encrypt -align 16 -_aes_hw_encrypt: -L$_aes_hw_encrypt_begin: -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$000pic -L$000pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+1-L$000pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - mov eax,DWORD [4+esp] - mov edx,DWORD [12+esp] - movups xmm2,[eax] - mov ecx,DWORD [240+edx] - mov eax,DWORD [8+esp] - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$001enc1_loop_1: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$001enc1_loop_1 -db 102,15,56,221,209 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - movups [eax],xmm2 - pxor xmm2,xmm2 - ret -global _aes_hw_decrypt -align 16 -_aes_hw_decrypt: -L$_aes_hw_decrypt_begin: - mov eax,DWORD [4+esp] - mov edx,DWORD [12+esp] - movups xmm2,[eax] - mov ecx,DWORD [240+edx] - mov eax,DWORD [8+esp] - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$002dec1_loop_2: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$002dec1_loop_2 -db 102,15,56,223,209 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - movups [eax],xmm2 - pxor xmm2,xmm2 - ret -align 16 -__aesni_encrypt2: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx - add ecx,16 -L$003enc2_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$003enc2_loop -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,221,208 -db 102,15,56,221,216 - ret -align 16 -__aesni_decrypt2: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx - add ecx,16 -L$004dec2_loop: -db 102,15,56,222,209 -db 102,15,56,222,217 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,222,208 -db 102,15,56,222,216 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$004dec2_loop -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,223,208 -db 102,15,56,223,216 - ret -align 16 -__aesni_encrypt3: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx - add ecx,16 -L$005enc3_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 -db 102,15,56,220,224 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$005enc3_loop -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -db 102,15,56,221,208 -db 102,15,56,221,216 -db 102,15,56,221,224 - ret -align 16 -__aesni_decrypt3: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx - add ecx,16 -L$006dec3_loop: -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,222,208 -db 102,15,56,222,216 -db 102,15,56,222,224 - movups 
xmm0,[ecx*1+edx-16] - jnz NEAR L$006dec3_loop -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -db 102,15,56,223,208 -db 102,15,56,223,216 -db 102,15,56,223,224 - ret -align 16 -__aesni_encrypt4: - movups xmm0,[edx] - movups xmm1,[16+edx] - shl ecx,4 - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 - pxor xmm5,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx -db 15,31,64,0 - add ecx,16 -L$007enc4_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -db 102,15,56,220,233 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 -db 102,15,56,220,224 -db 102,15,56,220,232 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$007enc4_loop -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -db 102,15,56,220,233 -db 102,15,56,221,208 -db 102,15,56,221,216 -db 102,15,56,221,224 -db 102,15,56,221,232 - ret -align 16 -__aesni_decrypt4: - movups xmm0,[edx] - movups xmm1,[16+edx] - shl ecx,4 - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 - pxor xmm5,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx -db 15,31,64,0 - add ecx,16 -L$008dec4_loop: -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -db 102,15,56,222,233 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,222,208 -db 102,15,56,222,216 -db 102,15,56,222,224 -db 102,15,56,222,232 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$008dec4_loop -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -db 102,15,56,222,233 -db 102,15,56,223,208 -db 102,15,56,223,216 -db 102,15,56,223,224 -db 102,15,56,223,232 - ret -align 16 -__aesni_encrypt6: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 -db 102,15,56,220,209 - pxor xmm5,xmm0 - pxor xmm6,xmm0 -db 102,15,56,220,217 - lea edx,[32+ecx*1+edx] - neg ecx -db 102,15,56,220,225 - pxor xmm7,xmm0 - movups xmm0,[ecx*1+edx] - add ecx,16 - jmp NEAR L$009_aesni_encrypt6_inner -align 16 -L$010enc6_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -L$009_aesni_encrypt6_inner: -db 102,15,56,220,233 -db 102,15,56,220,241 -db 102,15,56,220,249 -L$_aesni_encrypt6_enter: - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 -db 102,15,56,220,224 -db 102,15,56,220,232 -db 102,15,56,220,240 -db 102,15,56,220,248 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$010enc6_loop -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -db 102,15,56,220,233 -db 102,15,56,220,241 -db 102,15,56,220,249 -db 102,15,56,221,208 -db 102,15,56,221,216 -db 102,15,56,221,224 -db 102,15,56,221,232 -db 102,15,56,221,240 -db 102,15,56,221,248 - ret -align 16 -__aesni_decrypt6: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 -db 102,15,56,222,209 - pxor xmm5,xmm0 - pxor xmm6,xmm0 -db 102,15,56,222,217 - lea edx,[32+ecx*1+edx] - neg ecx -db 102,15,56,222,225 - pxor xmm7,xmm0 - movups xmm0,[ecx*1+edx] - add ecx,16 - jmp NEAR L$011_aesni_decrypt6_inner -align 16 -L$012dec6_loop: -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -L$011_aesni_decrypt6_inner: -db 102,15,56,222,233 -db 102,15,56,222,241 -db 102,15,56,222,249 -L$_aesni_decrypt6_enter: - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,222,208 -db 102,15,56,222,216 -db 102,15,56,222,224 -db 102,15,56,222,232 -db 102,15,56,222,240 -db 102,15,56,222,248 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$012dec6_loop -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -db 
102,15,56,222,233 -db 102,15,56,222,241 -db 102,15,56,222,249 -db 102,15,56,223,208 -db 102,15,56,223,216 -db 102,15,56,223,224 -db 102,15,56,223,232 -db 102,15,56,223,240 -db 102,15,56,223,248 - ret -global _aes_hw_ecb_encrypt -align 16 -_aes_hw_ecb_encrypt: -L$_aes_hw_ecb_encrypt_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - and eax,-16 - jz NEAR L$013ecb_ret - mov ecx,DWORD [240+edx] - test ebx,ebx - jz NEAR L$014ecb_decrypt - mov ebp,edx - mov ebx,ecx - cmp eax,96 - jb NEAR L$015ecb_enc_tail - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] - movdqu xmm4,[32+esi] - movdqu xmm5,[48+esi] - movdqu xmm6,[64+esi] - movdqu xmm7,[80+esi] - lea esi,[96+esi] - sub eax,96 - jmp NEAR L$016ecb_enc_loop6_enter -align 16 -L$017ecb_enc_loop6: - movups [edi],xmm2 - movdqu xmm2,[esi] - movups [16+edi],xmm3 - movdqu xmm3,[16+esi] - movups [32+edi],xmm4 - movdqu xmm4,[32+esi] - movups [48+edi],xmm5 - movdqu xmm5,[48+esi] - movups [64+edi],xmm6 - movdqu xmm6,[64+esi] - movups [80+edi],xmm7 - lea edi,[96+edi] - movdqu xmm7,[80+esi] - lea esi,[96+esi] -L$016ecb_enc_loop6_enter: - call __aesni_encrypt6 - mov edx,ebp - mov ecx,ebx - sub eax,96 - jnc NEAR L$017ecb_enc_loop6 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - movups [80+edi],xmm7 - lea edi,[96+edi] - add eax,96 - jz NEAR L$013ecb_ret -L$015ecb_enc_tail: - movups xmm2,[esi] - cmp eax,32 - jb NEAR L$018ecb_enc_one - movups xmm3,[16+esi] - je NEAR L$019ecb_enc_two - movups xmm4,[32+esi] - cmp eax,64 - jb NEAR L$020ecb_enc_three - movups xmm5,[48+esi] - je NEAR L$021ecb_enc_four - movups xmm6,[64+esi] - xorps xmm7,xmm7 - call __aesni_encrypt6 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - jmp NEAR L$013ecb_ret -align 16 -L$018ecb_enc_one: - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$022enc1_loop_3: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$022enc1_loop_3 -db 102,15,56,221,209 - movups [edi],xmm2 - jmp NEAR L$013ecb_ret -align 16 -L$019ecb_enc_two: - call __aesni_encrypt2 - movups [edi],xmm2 - movups [16+edi],xmm3 - jmp NEAR L$013ecb_ret -align 16 -L$020ecb_enc_three: - call __aesni_encrypt3 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - jmp NEAR L$013ecb_ret -align 16 -L$021ecb_enc_four: - call __aesni_encrypt4 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - jmp NEAR L$013ecb_ret -align 16 -L$014ecb_decrypt: - mov ebp,edx - mov ebx,ecx - cmp eax,96 - jb NEAR L$023ecb_dec_tail - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] - movdqu xmm4,[32+esi] - movdqu xmm5,[48+esi] - movdqu xmm6,[64+esi] - movdqu xmm7,[80+esi] - lea esi,[96+esi] - sub eax,96 - jmp NEAR L$024ecb_dec_loop6_enter -align 16 -L$025ecb_dec_loop6: - movups [edi],xmm2 - movdqu xmm2,[esi] - movups [16+edi],xmm3 - movdqu xmm3,[16+esi] - movups [32+edi],xmm4 - movdqu xmm4,[32+esi] - movups [48+edi],xmm5 - movdqu xmm5,[48+esi] - movups [64+edi],xmm6 - movdqu xmm6,[64+esi] - movups [80+edi],xmm7 - lea edi,[96+edi] - movdqu xmm7,[80+esi] - lea esi,[96+esi] -L$024ecb_dec_loop6_enter: - call __aesni_decrypt6 - mov edx,ebp - mov ecx,ebx - sub eax,96 - jnc NEAR L$025ecb_dec_loop6 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - movups 
[80+edi],xmm7 - lea edi,[96+edi] - add eax,96 - jz NEAR L$013ecb_ret -L$023ecb_dec_tail: - movups xmm2,[esi] - cmp eax,32 - jb NEAR L$026ecb_dec_one - movups xmm3,[16+esi] - je NEAR L$027ecb_dec_two - movups xmm4,[32+esi] - cmp eax,64 - jb NEAR L$028ecb_dec_three - movups xmm5,[48+esi] - je NEAR L$029ecb_dec_four - movups xmm6,[64+esi] - xorps xmm7,xmm7 - call __aesni_decrypt6 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - jmp NEAR L$013ecb_ret -align 16 -L$026ecb_dec_one: - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$030dec1_loop_4: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$030dec1_loop_4 -db 102,15,56,223,209 - movups [edi],xmm2 - jmp NEAR L$013ecb_ret -align 16 -L$027ecb_dec_two: - call __aesni_decrypt2 - movups [edi],xmm2 - movups [16+edi],xmm3 - jmp NEAR L$013ecb_ret -align 16 -L$028ecb_dec_three: - call __aesni_decrypt3 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - jmp NEAR L$013ecb_ret -align 16 -L$029ecb_dec_four: - call __aesni_decrypt4 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 -L$013ecb_ret: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_ccm64_encrypt_blocks -align 16 -_aes_hw_ccm64_encrypt_blocks: -L$_aes_hw_ccm64_encrypt_blocks_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - mov ecx,DWORD [40+esp] - mov ebp,esp - sub esp,60 - and esp,-16 - mov DWORD [48+esp],ebp - movdqu xmm7,[ebx] - movdqu xmm3,[ecx] - mov ecx,DWORD [240+edx] - mov DWORD [esp],202182159 - mov DWORD [4+esp],134810123 - mov DWORD [8+esp],67438087 - mov DWORD [12+esp],66051 - mov ebx,1 - xor ebp,ebp - mov DWORD [16+esp],ebx - mov DWORD [20+esp],ebp - mov DWORD [24+esp],ebp - mov DWORD [28+esp],ebp - shl ecx,4 - mov ebx,16 - lea ebp,[edx] - movdqa xmm5,[esp] - movdqa xmm2,xmm7 - lea edx,[32+ecx*1+edx] - sub ebx,ecx -db 102,15,56,0,253 -L$031ccm64_enc_outer: - movups xmm0,[ebp] - mov ecx,ebx - movups xmm6,[esi] - xorps xmm2,xmm0 - movups xmm1,[16+ebp] - xorps xmm0,xmm6 - xorps xmm3,xmm0 - movups xmm0,[32+ebp] -L$032ccm64_enc2_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$032ccm64_enc2_loop -db 102,15,56,220,209 -db 102,15,56,220,217 - paddq xmm7,[16+esp] - dec eax -db 102,15,56,221,208 -db 102,15,56,221,216 - lea esi,[16+esi] - xorps xmm6,xmm2 - movdqa xmm2,xmm7 - movups [edi],xmm6 -db 102,15,56,0,213 - lea edi,[16+edi] - jnz NEAR L$031ccm64_enc_outer - mov esp,DWORD [48+esp] - mov edi,DWORD [40+esp] - movups [edi],xmm3 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_ccm64_decrypt_blocks -align 16 -_aes_hw_ccm64_decrypt_blocks: -L$_aes_hw_ccm64_decrypt_blocks_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - mov ecx,DWORD [40+esp] - mov ebp,esp - sub esp,60 - and esp,-16 - mov DWORD [48+esp],ebp - movdqu xmm7,[ebx] - movdqu 
xmm3,[ecx] - mov ecx,DWORD [240+edx] - mov DWORD [esp],202182159 - mov DWORD [4+esp],134810123 - mov DWORD [8+esp],67438087 - mov DWORD [12+esp],66051 - mov ebx,1 - xor ebp,ebp - mov DWORD [16+esp],ebx - mov DWORD [20+esp],ebp - mov DWORD [24+esp],ebp - mov DWORD [28+esp],ebp - movdqa xmm5,[esp] - movdqa xmm2,xmm7 - mov ebp,edx - mov ebx,ecx -db 102,15,56,0,253 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$033enc1_loop_5: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$033enc1_loop_5 -db 102,15,56,221,209 - shl ebx,4 - mov ecx,16 - movups xmm6,[esi] - paddq xmm7,[16+esp] - lea esi,[16+esi] - sub ecx,ebx - lea edx,[32+ebx*1+ebp] - mov ebx,ecx - jmp NEAR L$034ccm64_dec_outer -align 16 -L$034ccm64_dec_outer: - xorps xmm6,xmm2 - movdqa xmm2,xmm7 - movups [edi],xmm6 - lea edi,[16+edi] -db 102,15,56,0,213 - sub eax,1 - jz NEAR L$035ccm64_dec_break - movups xmm0,[ebp] - mov ecx,ebx - movups xmm1,[16+ebp] - xorps xmm6,xmm0 - xorps xmm2,xmm0 - xorps xmm3,xmm6 - movups xmm0,[32+ebp] -L$036ccm64_dec2_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$036ccm64_dec2_loop - movups xmm6,[esi] - paddq xmm7,[16+esp] -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,221,208 -db 102,15,56,221,216 - lea esi,[16+esi] - jmp NEAR L$034ccm64_dec_outer -align 16 -L$035ccm64_dec_break: - mov ecx,DWORD [240+ebp] - mov edx,ebp - movups xmm0,[edx] - movups xmm1,[16+edx] - xorps xmm6,xmm0 - lea edx,[32+edx] - xorps xmm3,xmm6 -L$037enc1_loop_6: -db 102,15,56,220,217 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$037enc1_loop_6 -db 102,15,56,221,217 - mov esp,DWORD [48+esp] - mov edi,DWORD [40+esp] - movups [edi],xmm3 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_ctr32_encrypt_blocks -align 16 -_aes_hw_ctr32_encrypt_blocks: -L$_aes_hw_ctr32_encrypt_blocks_begin: - push ebp - push ebx - push esi - push edi -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$038pic -L$038pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+0-L$038pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - mov ebp,esp - sub esp,88 - and esp,-16 - mov DWORD [80+esp],ebp - cmp eax,1 - je NEAR L$039ctr32_one_shortcut - movdqu xmm7,[ebx] - mov DWORD [esp],202182159 - mov DWORD [4+esp],134810123 - mov DWORD [8+esp],67438087 - mov DWORD [12+esp],66051 - mov ecx,6 - xor ebp,ebp - mov DWORD [16+esp],ecx - mov DWORD [20+esp],ecx - mov DWORD [24+esp],ecx - mov DWORD [28+esp],ebp -db 102,15,58,22,251,3 -db 102,15,58,34,253,3 - mov ecx,DWORD [240+edx] - bswap ebx - pxor xmm0,xmm0 - pxor xmm1,xmm1 - movdqa xmm2,[esp] -db 102,15,58,34,195,0 - lea ebp,[3+ebx] -db 102,15,58,34,205,0 - inc ebx -db 102,15,58,34,195,1 - inc ebp -db 102,15,58,34,205,1 - inc ebx -db 102,15,58,34,195,2 - inc ebp -db 102,15,58,34,205,2 - movdqa [48+esp],xmm0 -db 102,15,56,0,194 - movdqu xmm6,[edx] - movdqa [64+esp],xmm1 -db 102,15,56,0,202 - pshufd xmm2,xmm0,192 - pshufd xmm3,xmm0,128 - cmp eax,6 - jb NEAR L$040ctr32_tail - pxor xmm7,xmm6 - shl ecx,4 - mov ebx,16 - movdqa [32+esp],xmm7 - mov ebp,edx - sub ebx,ecx - lea edx,[32+ecx*1+edx] - sub eax,6 - jmp NEAR 
L$041ctr32_loop6 -align 16 -L$041ctr32_loop6: - pshufd xmm4,xmm0,64 - movdqa xmm0,[32+esp] - pshufd xmm5,xmm1,192 - pxor xmm2,xmm0 - pshufd xmm6,xmm1,128 - pxor xmm3,xmm0 - pshufd xmm7,xmm1,64 - movups xmm1,[16+ebp] - pxor xmm4,xmm0 - pxor xmm5,xmm0 -db 102,15,56,220,209 - pxor xmm6,xmm0 - pxor xmm7,xmm0 -db 102,15,56,220,217 - movups xmm0,[32+ebp] - mov ecx,ebx -db 102,15,56,220,225 -db 102,15,56,220,233 -db 102,15,56,220,241 -db 102,15,56,220,249 - call L$_aesni_encrypt6_enter - movups xmm1,[esi] - movups xmm0,[16+esi] - xorps xmm2,xmm1 - movups xmm1,[32+esi] - xorps xmm3,xmm0 - movups [edi],xmm2 - movdqa xmm0,[16+esp] - xorps xmm4,xmm1 - movdqa xmm1,[64+esp] - movups [16+edi],xmm3 - movups [32+edi],xmm4 - paddd xmm1,xmm0 - paddd xmm0,[48+esp] - movdqa xmm2,[esp] - movups xmm3,[48+esi] - movups xmm4,[64+esi] - xorps xmm5,xmm3 - movups xmm3,[80+esi] - lea esi,[96+esi] - movdqa [48+esp],xmm0 -db 102,15,56,0,194 - xorps xmm6,xmm4 - movups [48+edi],xmm5 - xorps xmm7,xmm3 - movdqa [64+esp],xmm1 -db 102,15,56,0,202 - movups [64+edi],xmm6 - pshufd xmm2,xmm0,192 - movups [80+edi],xmm7 - lea edi,[96+edi] - pshufd xmm3,xmm0,128 - sub eax,6 - jnc NEAR L$041ctr32_loop6 - add eax,6 - jz NEAR L$042ctr32_ret - movdqu xmm7,[ebp] - mov edx,ebp - pxor xmm7,[32+esp] - mov ecx,DWORD [240+ebp] -L$040ctr32_tail: - por xmm2,xmm7 - cmp eax,2 - jb NEAR L$043ctr32_one - pshufd xmm4,xmm0,64 - por xmm3,xmm7 - je NEAR L$044ctr32_two - pshufd xmm5,xmm1,192 - por xmm4,xmm7 - cmp eax,4 - jb NEAR L$045ctr32_three - pshufd xmm6,xmm1,128 - por xmm5,xmm7 - je NEAR L$046ctr32_four - por xmm6,xmm7 - call __aesni_encrypt6 - movups xmm1,[esi] - movups xmm0,[16+esi] - xorps xmm2,xmm1 - movups xmm1,[32+esi] - xorps xmm3,xmm0 - movups xmm0,[48+esi] - xorps xmm4,xmm1 - movups xmm1,[64+esi] - xorps xmm5,xmm0 - movups [edi],xmm2 - xorps xmm6,xmm1 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - jmp NEAR L$042ctr32_ret -align 16 -L$039ctr32_one_shortcut: - movups xmm2,[ebx] - mov ecx,DWORD [240+edx] -L$043ctr32_one: - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$047enc1_loop_7: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$047enc1_loop_7 -db 102,15,56,221,209 - movups xmm6,[esi] - xorps xmm6,xmm2 - movups [edi],xmm6 - jmp NEAR L$042ctr32_ret -align 16 -L$044ctr32_two: - call __aesni_encrypt2 - movups xmm5,[esi] - movups xmm6,[16+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - movups [edi],xmm2 - movups [16+edi],xmm3 - jmp NEAR L$042ctr32_ret -align 16 -L$045ctr32_three: - call __aesni_encrypt3 - movups xmm5,[esi] - movups xmm6,[16+esi] - xorps xmm2,xmm5 - movups xmm7,[32+esi] - xorps xmm3,xmm6 - movups [edi],xmm2 - xorps xmm4,xmm7 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - jmp NEAR L$042ctr32_ret -align 16 -L$046ctr32_four: - call __aesni_encrypt4 - movups xmm6,[esi] - movups xmm7,[16+esi] - movups xmm1,[32+esi] - xorps xmm2,xmm6 - movups xmm0,[48+esi] - xorps xmm3,xmm7 - movups [edi],xmm2 - xorps xmm4,xmm1 - movups [16+edi],xmm3 - xorps xmm5,xmm0 - movups [32+edi],xmm4 - movups [48+edi],xmm5 -L$042ctr32_ret: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - movdqa [32+esp],xmm0 - pxor xmm5,xmm5 - movdqa [48+esp],xmm0 - pxor xmm6,xmm6 - movdqa [64+esp],xmm0 - pxor xmm7,xmm7 - mov esp,DWORD [80+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_xts_encrypt -align 16 -_aes_hw_xts_encrypt: -L$_aes_hw_xts_encrypt_begin: - push ebp - push ebx - push esi - push edi - mov 
edx,DWORD [36+esp] - mov esi,DWORD [40+esp] - mov ecx,DWORD [240+edx] - movups xmm2,[esi] - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$048enc1_loop_8: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$048enc1_loop_8 -db 102,15,56,221,209 - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebp,esp - sub esp,120 - mov ecx,DWORD [240+edx] - and esp,-16 - mov DWORD [96+esp],135 - mov DWORD [100+esp],0 - mov DWORD [104+esp],1 - mov DWORD [108+esp],0 - mov DWORD [112+esp],eax - mov DWORD [116+esp],ebp - movdqa xmm1,xmm2 - pxor xmm0,xmm0 - movdqa xmm3,[96+esp] - pcmpgtd xmm0,xmm1 - and eax,-16 - mov ebp,edx - mov ebx,ecx - sub eax,96 - jc NEAR L$049xts_enc_short - shl ecx,4 - mov ebx,16 - sub ebx,ecx - lea edx,[32+ecx*1+edx] - jmp NEAR L$050xts_enc_loop6 -align 16 -L$050xts_enc_loop6: - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [16+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [32+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [48+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm7,xmm0,19 - movdqa [64+esp],xmm1 - paddq xmm1,xmm1 - movups xmm0,[ebp] - pand xmm7,xmm3 - movups xmm2,[esi] - pxor xmm7,xmm1 - mov ecx,ebx - movdqu xmm3,[16+esi] - xorps xmm2,xmm0 - movdqu xmm4,[32+esi] - pxor xmm3,xmm0 - movdqu xmm5,[48+esi] - pxor xmm4,xmm0 - movdqu xmm6,[64+esi] - pxor xmm5,xmm0 - movdqu xmm1,[80+esi] - pxor xmm6,xmm0 - lea esi,[96+esi] - pxor xmm2,[esp] - movdqa [80+esp],xmm7 - pxor xmm7,xmm1 - movups xmm1,[16+ebp] - pxor xmm3,[16+esp] - pxor xmm4,[32+esp] -db 102,15,56,220,209 - pxor xmm5,[48+esp] - pxor xmm6,[64+esp] -db 102,15,56,220,217 - pxor xmm7,xmm0 - movups xmm0,[32+ebp] -db 102,15,56,220,225 -db 102,15,56,220,233 -db 102,15,56,220,241 -db 102,15,56,220,249 - call L$_aesni_encrypt6_enter - movdqa xmm1,[80+esp] - pxor xmm0,xmm0 - xorps xmm2,[esp] - pcmpgtd xmm0,xmm1 - xorps xmm3,[16+esp] - movups [edi],xmm2 - xorps xmm4,[32+esp] - movups [16+edi],xmm3 - xorps xmm5,[48+esp] - movups [32+edi],xmm4 - xorps xmm6,[64+esp] - movups [48+edi],xmm5 - xorps xmm7,xmm1 - movups [64+edi],xmm6 - pshufd xmm2,xmm0,19 - movups [80+edi],xmm7 - lea edi,[96+edi] - movdqa xmm3,[96+esp] - pxor xmm0,xmm0 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - sub eax,96 - jnc NEAR L$050xts_enc_loop6 - mov ecx,DWORD [240+ebp] - mov edx,ebp - mov ebx,ecx -L$049xts_enc_short: - add eax,96 - jz NEAR L$051xts_enc_done6x - movdqa xmm5,xmm1 - cmp eax,32 - jb NEAR L$052xts_enc_one - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - je NEAR L$053xts_enc_two - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm6,xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - cmp eax,64 - jb NEAR L$054xts_enc_three - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm7,xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - movdqa [esp],xmm5 - movdqa [16+esp],xmm6 - je NEAR L$055xts_enc_four - movdqa [32+esp],xmm7 - pshufd xmm7,xmm0,19 - movdqa [48+esp],xmm1 - paddq xmm1,xmm1 - pand xmm7,xmm3 - pxor xmm7,xmm1 - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] 
- movdqu xmm4,[32+esi] - pxor xmm2,[esp] - movdqu xmm5,[48+esi] - pxor xmm3,[16+esp] - movdqu xmm6,[64+esi] - pxor xmm4,[32+esp] - lea esi,[80+esi] - pxor xmm5,[48+esp] - movdqa [64+esp],xmm7 - pxor xmm6,xmm7 - call __aesni_encrypt6 - movaps xmm1,[64+esp] - xorps xmm2,[esp] - xorps xmm3,[16+esp] - xorps xmm4,[32+esp] - movups [edi],xmm2 - xorps xmm5,[48+esp] - movups [16+edi],xmm3 - xorps xmm6,xmm1 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - lea edi,[80+edi] - jmp NEAR L$056xts_enc_done -align 16 -L$052xts_enc_one: - movups xmm2,[esi] - lea esi,[16+esi] - xorps xmm2,xmm5 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$057enc1_loop_9: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$057enc1_loop_9 -db 102,15,56,221,209 - xorps xmm2,xmm5 - movups [edi],xmm2 - lea edi,[16+edi] - movdqa xmm1,xmm5 - jmp NEAR L$056xts_enc_done -align 16 -L$053xts_enc_two: - movaps xmm6,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - lea esi,[32+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - call __aesni_encrypt2 - xorps xmm2,xmm5 - xorps xmm3,xmm6 - movups [edi],xmm2 - movups [16+edi],xmm3 - lea edi,[32+edi] - movdqa xmm1,xmm6 - jmp NEAR L$056xts_enc_done -align 16 -L$054xts_enc_three: - movaps xmm7,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - movups xmm4,[32+esi] - lea esi,[48+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - xorps xmm4,xmm7 - call __aesni_encrypt3 - xorps xmm2,xmm5 - xorps xmm3,xmm6 - xorps xmm4,xmm7 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - lea edi,[48+edi] - movdqa xmm1,xmm7 - jmp NEAR L$056xts_enc_done -align 16 -L$055xts_enc_four: - movaps xmm6,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - movups xmm4,[32+esi] - xorps xmm2,[esp] - movups xmm5,[48+esi] - lea esi,[64+esi] - xorps xmm3,[16+esp] - xorps xmm4,xmm7 - xorps xmm5,xmm6 - call __aesni_encrypt4 - xorps xmm2,[esp] - xorps xmm3,[16+esp] - xorps xmm4,xmm7 - movups [edi],xmm2 - xorps xmm5,xmm6 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - lea edi,[64+edi] - movdqa xmm1,xmm6 - jmp NEAR L$056xts_enc_done -align 16 -L$051xts_enc_done6x: - mov eax,DWORD [112+esp] - and eax,15 - jz NEAR L$058xts_enc_ret - movdqa xmm5,xmm1 - mov DWORD [112+esp],eax - jmp NEAR L$059xts_enc_steal -align 16 -L$056xts_enc_done: - mov eax,DWORD [112+esp] - pxor xmm0,xmm0 - and eax,15 - jz NEAR L$058xts_enc_ret - pcmpgtd xmm0,xmm1 - mov DWORD [112+esp],eax - pshufd xmm5,xmm0,19 - paddq xmm1,xmm1 - pand xmm5,[96+esp] - pxor xmm5,xmm1 -L$059xts_enc_steal: - movzx ecx,BYTE [esi] - movzx edx,BYTE [edi-16] - lea esi,[1+esi] - mov BYTE [edi-16],cl - mov BYTE [edi],dl - lea edi,[1+edi] - sub eax,1 - jnz NEAR L$059xts_enc_steal - sub edi,DWORD [112+esp] - mov edx,ebp - mov ecx,ebx - movups xmm2,[edi-16] - xorps xmm2,xmm5 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$060enc1_loop_10: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$060enc1_loop_10 -db 102,15,56,221,209 - xorps xmm2,xmm5 - movups [edi-16],xmm2 -L$058xts_enc_ret: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movdqa [esp],xmm0 - pxor xmm3,xmm3 - movdqa [16+esp],xmm0 - pxor xmm4,xmm4 - movdqa [32+esp],xmm0 - pxor xmm5,xmm5 - movdqa [48+esp],xmm0 - pxor xmm6,xmm6 - movdqa [64+esp],xmm0 - pxor xmm7,xmm7 - movdqa [80+esp],xmm0 - mov esp,DWORD [116+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_xts_decrypt -align 16 -_aes_hw_xts_decrypt: -L$_aes_hw_xts_decrypt_begin: - push 
ebp - push ebx - push esi - push edi - mov edx,DWORD [36+esp] - mov esi,DWORD [40+esp] - mov ecx,DWORD [240+edx] - movups xmm2,[esi] - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$061enc1_loop_11: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$061enc1_loop_11 -db 102,15,56,221,209 - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebp,esp - sub esp,120 - and esp,-16 - xor ebx,ebx - test eax,15 - setnz bl - shl ebx,4 - sub eax,ebx - mov DWORD [96+esp],135 - mov DWORD [100+esp],0 - mov DWORD [104+esp],1 - mov DWORD [108+esp],0 - mov DWORD [112+esp],eax - mov DWORD [116+esp],ebp - mov ecx,DWORD [240+edx] - mov ebp,edx - mov ebx,ecx - movdqa xmm1,xmm2 - pxor xmm0,xmm0 - movdqa xmm3,[96+esp] - pcmpgtd xmm0,xmm1 - and eax,-16 - sub eax,96 - jc NEAR L$062xts_dec_short - shl ecx,4 - mov ebx,16 - sub ebx,ecx - lea edx,[32+ecx*1+edx] - jmp NEAR L$063xts_dec_loop6 -align 16 -L$063xts_dec_loop6: - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [16+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [32+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [48+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm7,xmm0,19 - movdqa [64+esp],xmm1 - paddq xmm1,xmm1 - movups xmm0,[ebp] - pand xmm7,xmm3 - movups xmm2,[esi] - pxor xmm7,xmm1 - mov ecx,ebx - movdqu xmm3,[16+esi] - xorps xmm2,xmm0 - movdqu xmm4,[32+esi] - pxor xmm3,xmm0 - movdqu xmm5,[48+esi] - pxor xmm4,xmm0 - movdqu xmm6,[64+esi] - pxor xmm5,xmm0 - movdqu xmm1,[80+esi] - pxor xmm6,xmm0 - lea esi,[96+esi] - pxor xmm2,[esp] - movdqa [80+esp],xmm7 - pxor xmm7,xmm1 - movups xmm1,[16+ebp] - pxor xmm3,[16+esp] - pxor xmm4,[32+esp] -db 102,15,56,222,209 - pxor xmm5,[48+esp] - pxor xmm6,[64+esp] -db 102,15,56,222,217 - pxor xmm7,xmm0 - movups xmm0,[32+ebp] -db 102,15,56,222,225 -db 102,15,56,222,233 -db 102,15,56,222,241 -db 102,15,56,222,249 - call L$_aesni_decrypt6_enter - movdqa xmm1,[80+esp] - pxor xmm0,xmm0 - xorps xmm2,[esp] - pcmpgtd xmm0,xmm1 - xorps xmm3,[16+esp] - movups [edi],xmm2 - xorps xmm4,[32+esp] - movups [16+edi],xmm3 - xorps xmm5,[48+esp] - movups [32+edi],xmm4 - xorps xmm6,[64+esp] - movups [48+edi],xmm5 - xorps xmm7,xmm1 - movups [64+edi],xmm6 - pshufd xmm2,xmm0,19 - movups [80+edi],xmm7 - lea edi,[96+edi] - movdqa xmm3,[96+esp] - pxor xmm0,xmm0 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - sub eax,96 - jnc NEAR L$063xts_dec_loop6 - mov ecx,DWORD [240+ebp] - mov edx,ebp - mov ebx,ecx -L$062xts_dec_short: - add eax,96 - jz NEAR L$064xts_dec_done6x - movdqa xmm5,xmm1 - cmp eax,32 - jb NEAR L$065xts_dec_one - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - je NEAR L$066xts_dec_two - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm6,xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - cmp eax,64 - jb NEAR L$067xts_dec_three - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm7,xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - movdqa [esp],xmm5 - movdqa [16+esp],xmm6 - je NEAR L$068xts_dec_four - movdqa [32+esp],xmm7 - pshufd xmm7,xmm0,19 - movdqa 
[48+esp],xmm1 - paddq xmm1,xmm1 - pand xmm7,xmm3 - pxor xmm7,xmm1 - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] - movdqu xmm4,[32+esi] - pxor xmm2,[esp] - movdqu xmm5,[48+esi] - pxor xmm3,[16+esp] - movdqu xmm6,[64+esi] - pxor xmm4,[32+esp] - lea esi,[80+esi] - pxor xmm5,[48+esp] - movdqa [64+esp],xmm7 - pxor xmm6,xmm7 - call __aesni_decrypt6 - movaps xmm1,[64+esp] - xorps xmm2,[esp] - xorps xmm3,[16+esp] - xorps xmm4,[32+esp] - movups [edi],xmm2 - xorps xmm5,[48+esp] - movups [16+edi],xmm3 - xorps xmm6,xmm1 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - lea edi,[80+edi] - jmp NEAR L$069xts_dec_done -align 16 -L$065xts_dec_one: - movups xmm2,[esi] - lea esi,[16+esi] - xorps xmm2,xmm5 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$070dec1_loop_12: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$070dec1_loop_12 -db 102,15,56,223,209 - xorps xmm2,xmm5 - movups [edi],xmm2 - lea edi,[16+edi] - movdqa xmm1,xmm5 - jmp NEAR L$069xts_dec_done -align 16 -L$066xts_dec_two: - movaps xmm6,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - lea esi,[32+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - call __aesni_decrypt2 - xorps xmm2,xmm5 - xorps xmm3,xmm6 - movups [edi],xmm2 - movups [16+edi],xmm3 - lea edi,[32+edi] - movdqa xmm1,xmm6 - jmp NEAR L$069xts_dec_done -align 16 -L$067xts_dec_three: - movaps xmm7,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - movups xmm4,[32+esi] - lea esi,[48+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - xorps xmm4,xmm7 - call __aesni_decrypt3 - xorps xmm2,xmm5 - xorps xmm3,xmm6 - xorps xmm4,xmm7 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - lea edi,[48+edi] - movdqa xmm1,xmm7 - jmp NEAR L$069xts_dec_done -align 16 -L$068xts_dec_four: - movaps xmm6,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - movups xmm4,[32+esi] - xorps xmm2,[esp] - movups xmm5,[48+esi] - lea esi,[64+esi] - xorps xmm3,[16+esp] - xorps xmm4,xmm7 - xorps xmm5,xmm6 - call __aesni_decrypt4 - xorps xmm2,[esp] - xorps xmm3,[16+esp] - xorps xmm4,xmm7 - movups [edi],xmm2 - xorps xmm5,xmm6 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - lea edi,[64+edi] - movdqa xmm1,xmm6 - jmp NEAR L$069xts_dec_done -align 16 -L$064xts_dec_done6x: - mov eax,DWORD [112+esp] - and eax,15 - jz NEAR L$071xts_dec_ret - mov DWORD [112+esp],eax - jmp NEAR L$072xts_dec_only_one_more -align 16 -L$069xts_dec_done: - mov eax,DWORD [112+esp] - pxor xmm0,xmm0 - and eax,15 - jz NEAR L$071xts_dec_ret - pcmpgtd xmm0,xmm1 - mov DWORD [112+esp],eax - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm3,[96+esp] - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 -L$072xts_dec_only_one_more: - pshufd xmm5,xmm0,19 - movdqa xmm6,xmm1 - paddq xmm1,xmm1 - pand xmm5,xmm3 - pxor xmm5,xmm1 - mov edx,ebp - mov ecx,ebx - movups xmm2,[esi] - xorps xmm2,xmm5 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$073dec1_loop_13: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$073dec1_loop_13 -db 102,15,56,223,209 - xorps xmm2,xmm5 - movups [edi],xmm2 -L$074xts_dec_steal: - movzx ecx,BYTE [16+esi] - movzx edx,BYTE [edi] - lea esi,[1+esi] - mov BYTE [edi],cl - mov BYTE [16+edi],dl - lea edi,[1+edi] - sub eax,1 - jnz NEAR L$074xts_dec_steal - sub edi,DWORD [112+esp] - mov edx,ebp - mov ecx,ebx - movups xmm2,[edi] - xorps xmm2,xmm6 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$075dec1_loop_14: -db 
102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$075dec1_loop_14 -db 102,15,56,223,209 - xorps xmm2,xmm6 - movups [edi],xmm2 -L$071xts_dec_ret: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movdqa [esp],xmm0 - pxor xmm3,xmm3 - movdqa [16+esp],xmm0 - pxor xmm4,xmm4 - movdqa [32+esp],xmm0 - pxor xmm5,xmm5 - movdqa [48+esp],xmm0 - pxor xmm6,xmm6 - movdqa [64+esp],xmm0 - pxor xmm7,xmm7 - movdqa [80+esp],xmm0 - mov esp,DWORD [116+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_cbc_encrypt -align 16 -_aes_hw_cbc_encrypt: -L$_aes_hw_cbc_encrypt_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov ebx,esp - mov edi,DWORD [24+esp] - sub ebx,24 - mov eax,DWORD [28+esp] - and ebx,-16 - mov edx,DWORD [32+esp] - mov ebp,DWORD [36+esp] - test eax,eax - jz NEAR L$076cbc_abort - cmp DWORD [40+esp],0 - xchg ebx,esp - movups xmm7,[ebp] - mov ecx,DWORD [240+edx] - mov ebp,edx - mov DWORD [16+esp],ebx - mov ebx,ecx - je NEAR L$077cbc_decrypt - movaps xmm2,xmm7 - cmp eax,16 - jb NEAR L$078cbc_enc_tail - sub eax,16 - jmp NEAR L$079cbc_enc_loop -align 16 -L$079cbc_enc_loop: - movups xmm7,[esi] - lea esi,[16+esi] - movups xmm0,[edx] - movups xmm1,[16+edx] - xorps xmm7,xmm0 - lea edx,[32+edx] - xorps xmm2,xmm7 -L$080enc1_loop_15: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$080enc1_loop_15 -db 102,15,56,221,209 - mov ecx,ebx - mov edx,ebp - movups [edi],xmm2 - lea edi,[16+edi] - sub eax,16 - jnc NEAR L$079cbc_enc_loop - add eax,16 - jnz NEAR L$078cbc_enc_tail - movaps xmm7,xmm2 - pxor xmm2,xmm2 - jmp NEAR L$081cbc_ret -L$078cbc_enc_tail: - mov ecx,eax -dd 2767451785 - mov ecx,16 - sub ecx,eax - xor eax,eax -dd 2868115081 - lea edi,[edi-16] - mov ecx,ebx - mov esi,edi - mov edx,ebp - jmp NEAR L$079cbc_enc_loop -align 16 -L$077cbc_decrypt: - cmp eax,80 - jbe NEAR L$082cbc_dec_tail - movaps [esp],xmm7 - sub eax,80 - jmp NEAR L$083cbc_dec_loop6_enter -align 16 -L$084cbc_dec_loop6: - movaps [esp],xmm0 - movups [edi],xmm7 - lea edi,[16+edi] -L$083cbc_dec_loop6_enter: - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] - movdqu xmm4,[32+esi] - movdqu xmm5,[48+esi] - movdqu xmm6,[64+esi] - movdqu xmm7,[80+esi] - call __aesni_decrypt6 - movups xmm1,[esi] - movups xmm0,[16+esi] - xorps xmm2,[esp] - xorps xmm3,xmm1 - movups xmm1,[32+esi] - xorps xmm4,xmm0 - movups xmm0,[48+esi] - xorps xmm5,xmm1 - movups xmm1,[64+esi] - xorps xmm6,xmm0 - movups xmm0,[80+esi] - xorps xmm7,xmm1 - movups [edi],xmm2 - movups [16+edi],xmm3 - lea esi,[96+esi] - movups [32+edi],xmm4 - mov ecx,ebx - movups [48+edi],xmm5 - mov edx,ebp - movups [64+edi],xmm6 - lea edi,[80+edi] - sub eax,96 - ja NEAR L$084cbc_dec_loop6 - movaps xmm2,xmm7 - movaps xmm7,xmm0 - add eax,80 - jle NEAR L$085cbc_dec_clear_tail_collected - movups [edi],xmm2 - lea edi,[16+edi] -L$082cbc_dec_tail: - movups xmm2,[esi] - movaps xmm6,xmm2 - cmp eax,16 - jbe NEAR L$086cbc_dec_one - movups xmm3,[16+esi] - movaps xmm5,xmm3 - cmp eax,32 - jbe NEAR L$087cbc_dec_two - movups xmm4,[32+esi] - cmp eax,48 - jbe NEAR L$088cbc_dec_three - movups xmm5,[48+esi] - cmp eax,64 - jbe NEAR L$089cbc_dec_four - movups xmm6,[64+esi] - movaps [esp],xmm7 - movups xmm2,[esi] - xorps xmm7,xmm7 - call __aesni_decrypt6 - movups xmm1,[esi] - movups xmm0,[16+esi] - xorps xmm2,[esp] - xorps xmm3,xmm1 - movups xmm1,[32+esi] - xorps xmm4,xmm0 - movups xmm0,[48+esi] - xorps xmm5,xmm1 - movups xmm7,[64+esi] - xorps xmm6,xmm0 - movups [edi],xmm2 - movups [16+edi],xmm3 - pxor xmm3,xmm3 - movups 
[32+edi],xmm4 - pxor xmm4,xmm4 - movups [48+edi],xmm5 - pxor xmm5,xmm5 - lea edi,[64+edi] - movaps xmm2,xmm6 - pxor xmm6,xmm6 - sub eax,80 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$086cbc_dec_one: - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$091dec1_loop_16: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$091dec1_loop_16 -db 102,15,56,223,209 - xorps xmm2,xmm7 - movaps xmm7,xmm6 - sub eax,16 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$087cbc_dec_two: - call __aesni_decrypt2 - xorps xmm2,xmm7 - xorps xmm3,xmm6 - movups [edi],xmm2 - movaps xmm2,xmm3 - pxor xmm3,xmm3 - lea edi,[16+edi] - movaps xmm7,xmm5 - sub eax,32 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$088cbc_dec_three: - call __aesni_decrypt3 - xorps xmm2,xmm7 - xorps xmm3,xmm6 - xorps xmm4,xmm5 - movups [edi],xmm2 - movaps xmm2,xmm4 - pxor xmm4,xmm4 - movups [16+edi],xmm3 - pxor xmm3,xmm3 - lea edi,[32+edi] - movups xmm7,[32+esi] - sub eax,48 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$089cbc_dec_four: - call __aesni_decrypt4 - movups xmm1,[16+esi] - movups xmm0,[32+esi] - xorps xmm2,xmm7 - movups xmm7,[48+esi] - xorps xmm3,xmm6 - movups [edi],xmm2 - xorps xmm4,xmm1 - movups [16+edi],xmm3 - pxor xmm3,xmm3 - xorps xmm5,xmm0 - movups [32+edi],xmm4 - pxor xmm4,xmm4 - lea edi,[48+edi] - movaps xmm2,xmm5 - pxor xmm5,xmm5 - sub eax,64 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$085cbc_dec_clear_tail_collected: - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 -L$090cbc_dec_tail_collected: - and eax,15 - jnz NEAR L$092cbc_dec_tail_partial - movups [edi],xmm2 - pxor xmm0,xmm0 - jmp NEAR L$081cbc_ret -align 16 -L$092cbc_dec_tail_partial: - movaps [esp],xmm2 - pxor xmm0,xmm0 - mov ecx,16 - mov esi,esp - sub ecx,eax -dd 2767451785 - movdqa [esp],xmm2 -L$081cbc_ret: - mov esp,DWORD [16+esp] - mov ebp,DWORD [36+esp] - pxor xmm2,xmm2 - pxor xmm1,xmm1 - movups [ebp],xmm7 - pxor xmm7,xmm7 -L$076cbc_abort: - pop edi - pop esi - pop ebx - pop ebp - ret -align 16 -__aesni_set_encrypt_key: - push ebp - push ebx - test eax,eax - jz NEAR L$093bad_pointer - test edx,edx - jz NEAR L$093bad_pointer - call L$094pic -L$094pic: - pop ebx - lea ebx,[(L$key_const-L$094pic)+ebx] - lea ebp,[_OPENSSL_ia32cap_P] - movups xmm0,[eax] - xorps xmm4,xmm4 - mov ebp,DWORD [4+ebp] - lea edx,[16+edx] - and ebp,268437504 - cmp ecx,256 - je NEAR L$09514rounds - cmp ecx,192 - je NEAR L$09612rounds - cmp ecx,128 - jne NEAR L$097bad_keybits -align 16 -L$09810rounds: - cmp ebp,268435456 - je NEAR L$09910rounds_alt - mov ecx,9 - movups [edx-16],xmm0 -db 102,15,58,223,200,1 - call L$100key_128_cold -db 102,15,58,223,200,2 - call L$101key_128 -db 102,15,58,223,200,4 - call L$101key_128 -db 102,15,58,223,200,8 - call L$101key_128 -db 102,15,58,223,200,16 - call L$101key_128 -db 102,15,58,223,200,32 - call L$101key_128 -db 102,15,58,223,200,64 - call L$101key_128 -db 102,15,58,223,200,128 - call L$101key_128 -db 102,15,58,223,200,27 - call L$101key_128 -db 102,15,58,223,200,54 - call L$101key_128 - movups [edx],xmm0 - mov DWORD [80+edx],ecx - jmp NEAR L$102good_key -align 16 -L$101key_128: - movups [edx],xmm0 - lea edx,[16+edx] -L$100key_128_cold: - shufps xmm4,xmm0,16 - xorps xmm0,xmm4 - shufps xmm4,xmm0,140 - xorps xmm0,xmm4 - shufps xmm1,xmm1,255 - xorps xmm0,xmm1 - ret -align 16 -L$09910rounds_alt: - movdqa xmm5,[ebx] - mov ecx,8 - movdqa xmm4,[32+ebx] - movdqa xmm2,xmm0 - movdqu [edx-16],xmm0 -L$103loop_key128: -db 102,15,56,0,197 -db 
102,15,56,221,196 - pslld xmm4,1 - lea edx,[16+edx] - movdqa xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm2,xmm3 - pxor xmm0,xmm2 - movdqu [edx-16],xmm0 - movdqa xmm2,xmm0 - dec ecx - jnz NEAR L$103loop_key128 - movdqa xmm4,[48+ebx] -db 102,15,56,0,197 -db 102,15,56,221,196 - pslld xmm4,1 - movdqa xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm2,xmm3 - pxor xmm0,xmm2 - movdqu [edx],xmm0 - movdqa xmm2,xmm0 -db 102,15,56,0,197 -db 102,15,56,221,196 - movdqa xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm2,xmm3 - pxor xmm0,xmm2 - movdqu [16+edx],xmm0 - mov ecx,9 - mov DWORD [96+edx],ecx - jmp NEAR L$102good_key -align 16 -L$09612rounds: - movq xmm2,[16+eax] - cmp ebp,268435456 - je NEAR L$10412rounds_alt - mov ecx,11 - movups [edx-16],xmm0 -db 102,15,58,223,202,1 - call L$105key_192a_cold -db 102,15,58,223,202,2 - call L$106key_192b -db 102,15,58,223,202,4 - call L$107key_192a -db 102,15,58,223,202,8 - call L$106key_192b -db 102,15,58,223,202,16 - call L$107key_192a -db 102,15,58,223,202,32 - call L$106key_192b -db 102,15,58,223,202,64 - call L$107key_192a -db 102,15,58,223,202,128 - call L$106key_192b - movups [edx],xmm0 - mov DWORD [48+edx],ecx - jmp NEAR L$102good_key -align 16 -L$107key_192a: - movups [edx],xmm0 - lea edx,[16+edx] -align 16 -L$105key_192a_cold: - movaps xmm5,xmm2 -L$108key_192b_warm: - shufps xmm4,xmm0,16 - movdqa xmm3,xmm2 - xorps xmm0,xmm4 - shufps xmm4,xmm0,140 - pslldq xmm3,4 - xorps xmm0,xmm4 - pshufd xmm1,xmm1,85 - pxor xmm2,xmm3 - pxor xmm0,xmm1 - pshufd xmm3,xmm0,255 - pxor xmm2,xmm3 - ret -align 16 -L$106key_192b: - movaps xmm3,xmm0 - shufps xmm5,xmm0,68 - movups [edx],xmm5 - shufps xmm3,xmm2,78 - movups [16+edx],xmm3 - lea edx,[32+edx] - jmp NEAR L$108key_192b_warm -align 16 -L$10412rounds_alt: - movdqa xmm5,[16+ebx] - movdqa xmm4,[32+ebx] - mov ecx,8 - movdqu [edx-16],xmm0 -L$109loop_key192: - movq [edx],xmm2 - movdqa xmm1,xmm2 -db 102,15,56,0,213 -db 102,15,56,221,212 - pslld xmm4,1 - lea edx,[24+edx] - movdqa xmm3,xmm0 - pslldq xmm0,4 - pxor xmm3,xmm0 - pslldq xmm0,4 - pxor xmm3,xmm0 - pslldq xmm0,4 - pxor xmm0,xmm3 - pshufd xmm3,xmm0,255 - pxor xmm3,xmm1 - pslldq xmm1,4 - pxor xmm3,xmm1 - pxor xmm0,xmm2 - pxor xmm2,xmm3 - movdqu [edx-16],xmm0 - dec ecx - jnz NEAR L$109loop_key192 - mov ecx,11 - mov DWORD [32+edx],ecx - jmp NEAR L$102good_key -align 16 -L$09514rounds: - movups xmm2,[16+eax] - lea edx,[16+edx] - cmp ebp,268435456 - je NEAR L$11014rounds_alt - mov ecx,13 - movups [edx-32],xmm0 - movups [edx-16],xmm2 -db 102,15,58,223,202,1 - call L$111key_256a_cold -db 102,15,58,223,200,1 - call L$112key_256b -db 102,15,58,223,202,2 - call L$113key_256a -db 102,15,58,223,200,2 - call L$112key_256b -db 102,15,58,223,202,4 - call L$113key_256a -db 102,15,58,223,200,4 - call L$112key_256b -db 102,15,58,223,202,8 - call L$113key_256a -db 102,15,58,223,200,8 - call L$112key_256b -db 102,15,58,223,202,16 - call L$113key_256a -db 102,15,58,223,200,16 - call L$112key_256b -db 102,15,58,223,202,32 - call L$113key_256a -db 102,15,58,223,200,32 - call L$112key_256b -db 102,15,58,223,202,64 - call L$113key_256a - movups [edx],xmm0 - mov DWORD [16+edx],ecx - xor eax,eax - jmp NEAR L$102good_key -align 16 -L$113key_256a: - movups [edx],xmm2 - lea edx,[16+edx] -L$111key_256a_cold: - shufps xmm4,xmm0,16 - xorps xmm0,xmm4 - shufps xmm4,xmm0,140 - xorps xmm0,xmm4 - shufps xmm1,xmm1,255 - xorps xmm0,xmm1 - ret 
-align 16 -L$112key_256b: - movups [edx],xmm0 - lea edx,[16+edx] - shufps xmm4,xmm2,16 - xorps xmm2,xmm4 - shufps xmm4,xmm2,140 - xorps xmm2,xmm4 - shufps xmm1,xmm1,170 - xorps xmm2,xmm1 - ret -align 16 -L$11014rounds_alt: - movdqa xmm5,[ebx] - movdqa xmm4,[32+ebx] - mov ecx,7 - movdqu [edx-32],xmm0 - movdqa xmm1,xmm2 - movdqu [edx-16],xmm2 -L$114loop_key256: -db 102,15,56,0,213 -db 102,15,56,221,212 - movdqa xmm3,xmm0 - pslldq xmm0,4 - pxor xmm3,xmm0 - pslldq xmm0,4 - pxor xmm3,xmm0 - pslldq xmm0,4 - pxor xmm0,xmm3 - pslld xmm4,1 - pxor xmm0,xmm2 - movdqu [edx],xmm0 - dec ecx - jz NEAR L$115done_key256 - pshufd xmm2,xmm0,255 - pxor xmm3,xmm3 -db 102,15,56,221,211 - movdqa xmm3,xmm1 - pslldq xmm1,4 - pxor xmm3,xmm1 - pslldq xmm1,4 - pxor xmm3,xmm1 - pslldq xmm1,4 - pxor xmm1,xmm3 - pxor xmm2,xmm1 - movdqu [16+edx],xmm2 - lea edx,[32+edx] - movdqa xmm1,xmm2 - jmp NEAR L$114loop_key256 -L$115done_key256: - mov ecx,13 - mov DWORD [16+edx],ecx -L$102good_key: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - xor eax,eax - pop ebx - pop ebp - ret -align 4 -L$093bad_pointer: - mov eax,-1 - pop ebx - pop ebp - ret -align 4 -L$097bad_keybits: - pxor xmm0,xmm0 - mov eax,-2 - pop ebx - pop ebp - ret -global _aes_hw_set_encrypt_key -align 16 -_aes_hw_set_encrypt_key: -L$_aes_hw_set_encrypt_key_begin: -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$116pic -L$116pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+3-L$116pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - mov eax,DWORD [4+esp] - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - call __aesni_set_encrypt_key - ret -global _aes_hw_set_decrypt_key -align 16 -_aes_hw_set_decrypt_key: -L$_aes_hw_set_decrypt_key_begin: - mov eax,DWORD [4+esp] - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - call __aesni_set_encrypt_key - mov edx,DWORD [12+esp] - shl ecx,4 - test eax,eax - jnz NEAR L$117dec_key_ret - lea eax,[16+ecx*1+edx] - movups xmm0,[edx] - movups xmm1,[eax] - movups [eax],xmm0 - movups [edx],xmm1 - lea edx,[16+edx] - lea eax,[eax-16] -L$118dec_key_inverse: - movups xmm0,[edx] - movups xmm1,[eax] -db 102,15,56,219,192 -db 102,15,56,219,201 - lea edx,[16+edx] - lea eax,[eax-16] - movups [16+eax],xmm0 - movups [edx-16],xmm1 - cmp eax,edx - ja NEAR L$118dec_key_inverse - movups xmm0,[edx] -db 102,15,56,219,192 - movups [edx],xmm0 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - xor eax,eax -L$117dec_key_ret: - ret -align 64 -L$key_const: -dd 202313229,202313229,202313229,202313229 -dd 67569157,67569157,67569157,67569157 -dd 1,1,1,1 -dd 27,27,27,27 -db 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 -db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 -db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 -db 115,108,46,111,114,103,62,0 -segment .bss -common _OPENSSL_ia32cap_P 16 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-ios.ios.arm.S deleted file mode 100644 index 9d4c1984..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-ios.ios.arm.S +++ /dev/null @@ -1,808 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the 
BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -#if __ARM_MAX_ARCH__>=7 -.text - - -.code 32 -#undef __thumb2__ -.align 5 -Lrcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.text - -.globl _aes_hw_set_encrypt_key -.private_extern _aes_hw_set_encrypt_key -#ifdef __thumb2__ -.thumb_func _aes_hw_set_encrypt_key -#endif -.align 5 -_aes_hw_set_encrypt_key: -Lenc_key: - mov r3,#-1 - cmp r0,#0 - beq Lenc_key_abort - cmp r2,#0 - beq Lenc_key_abort - mov r3,#-2 - cmp r1,#128 - blt Lenc_key_abort - cmp r1,#256 - bgt Lenc_key_abort - tst r1,#0x3f - bne Lenc_key_abort - - adr r3,Lrcon - cmp r1,#192 - - veor q0,q0,q0 - vld1.8 {q3},[r0]! - mov r1,#8 @ reuse r1 - vld1.32 {q1,q2},[r3]! - - blt Loop128 - beq L192 - b L256 - -.align 4 -Loop128: - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - bne Loop128 - - vld1.32 {q1},[r3] - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - veor q3,q3,q10 - vst1.32 {q3},[r2] - add r2,r2,#0x50 - - mov r12,#10 - b Ldone - -.align 4 -L192: - vld1.8 {d16},[r0]! - vmov.i8 q10,#8 @ borrow q10 - vst1.32 {q3},[r2]! - vsub.i8 q2,q2,q10 @ adjust the mask - -Loop192: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {d16},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - - vdup.32 q9,d7[1] - veor q9,q9,q8 - veor q10,q10,q1 - vext.8 q8,q0,q8,#12 - vshl.u8 q1,q1,#1 - veor q8,q8,q9 - veor q3,q3,q10 - veor q8,q8,q10 - vst1.32 {q3},[r2]! - bne Loop192 - - mov r12,#12 - add r2,r2,#0x20 - b Ldone - -.align 4 -L256: - vld1.8 {q8},[r0] - mov r1,#7 - mov r12,#14 - vst1.32 {q3},[r2]! - -Loop256: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q8},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - vst1.32 {q3},[r2]! 
- beq Ldone - - vdup.32 q10,d7[1] - vext.8 q9,q0,q8,#12 -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - - veor q8,q8,q10 - b Loop256 - -Ldone: - str r12,[r2] - mov r3,#0 - -Lenc_key_abort: - mov r0,r3 @ return value - - bx lr - - -.globl _aes_hw_set_decrypt_key -.private_extern _aes_hw_set_decrypt_key -#ifdef __thumb2__ -.thumb_func _aes_hw_set_decrypt_key -#endif -.align 5 -_aes_hw_set_decrypt_key: - stmdb sp!,{r4,lr} - bl Lenc_key - - cmp r0,#0 - bne Ldec_key_abort - - sub r2,r2,#240 @ restore original r2 - mov r4,#-16 - add r0,r2,r12,lsl#4 @ end of key schedule - - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - -Loop_imc: - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - cmp r0,r2 - bhi Loop_imc - - vld1.32 {q0},[r2] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - vst1.32 {q0},[r0] - - eor r0,r0,r0 @ return value -Ldec_key_abort: - ldmia sp!,{r4,pc} - -.globl _aes_hw_encrypt -.private_extern _aes_hw_encrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_encrypt -#endif -.align 5 -_aes_hw_encrypt: - AARCH64_VALID_CALL_TARGET - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -Loop_enc: -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q0},[r2]! - subs r3,r3,#2 -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q1},[r2]! - bgt Loop_enc - -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q0},[r2] -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr - -.globl _aes_hw_decrypt -.private_extern _aes_hw_decrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_decrypt -#endif -.align 5 -_aes_hw_decrypt: - AARCH64_VALID_CALL_TARGET - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -Loop_dec: -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q0},[r2]! - subs r3,r3,#2 -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q1},[r2]! - bgt Loop_dec - -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q0},[r2] -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr - -.globl _aes_hw_cbc_encrypt -.private_extern _aes_hw_cbc_encrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_cbc_encrypt -#endif -.align 5 -_aes_hw_cbc_encrypt: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,lr} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldmia ip,{r4,r5} @ load remaining args - subs r2,r2,#16 - mov r8,#16 - blo Lcbc_abort - moveq r8,#0 - - cmp r5,#0 @ en- or decrypting? - ldr r5,[r3,#240] - and r2,r2,#-16 - vld1.8 {q6},[r4] - vld1.8 {q0},[r0],r8 - - vld1.32 {q8,q9},[r3] @ load key schedule... - sub r5,r5,#6 - add r7,r3,r5,lsl#4 @ pointer to last 7 round keys - sub r5,r5,#2 - vld1.32 {q10,q11},[r7]! - vld1.32 {q12,q13},[r7]! - vld1.32 {q14,q15},[r7]! 
- vld1.32 {q7},[r7] - - add r7,r3,#32 - mov r6,r5 - beq Lcbc_dec - - cmp r5,#2 - veor q0,q0,q6 - veor q5,q8,q7 - beq Lcbc_enc128 - - vld1.32 {q2,q3},[r7] - add r7,r3,#16 - add r6,r3,#16*4 - add r12,r3,#16*5 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - add r14,r3,#16*6 - add r3,r3,#16*7 - b Lenter_cbc_enc - -.align 4 -Loop_cbc_enc: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! -Lenter_cbc_enc: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q8},[r6] - cmp r5,#4 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r12] - beq Lcbc_enc192 - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q8},[r14] -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r3] - nop - -Lcbc_enc192: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r7] @ re-pre-load rndkey[1] -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs Loop_cbc_enc - - vst1.8 {q6},[r1]! - b Lcbc_done - -.align 5 -Lcbc_enc128: - vld1.32 {q2,q3},[r7] -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - b Lenter_cbc_enc128 -Loop_cbc_enc128: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! -Lenter_cbc_enc128: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs Loop_cbc_enc128 - - vst1.8 {q6},[r1]! - b Lcbc_done -.align 5 -Lcbc_dec: - vld1.8 {q10},[r0]! - subs r2,r2,#32 @ bias - add r6,r5,#2 - vorr q3,q0,q0 - vorr q1,q0,q0 - vorr q11,q10,q10 - blo Lcbc_dec_tail - - vorr q1,q10,q10 - vld1.8 {q10},[r0]! - vorr q2,q0,q0 - vorr q3,q1,q1 - vorr q11,q10,q10 - -Loop3x_cbc_dec: -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! 
- subs r6,r6,#2 -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q9},[r7]! - bgt Loop3x_cbc_dec - -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q4,q6,q7 - subs r2,r2,#0x30 - veor q5,q2,q7 - movlo r6,r2 @ r6, r6, is zero at this point -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q9,q3,q7 - add r0,r0,r6 @ r0 is adjusted in such way that - @ at exit from the loop q1-q10 - @ are loaded with last "words" - vorr q6,q11,q11 - mov r7,r3 -.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q2},[r0]! -.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q3},[r0]! -.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q11},[r0]! -.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - add r6,r5,#2 - veor q4,q4,q0 - veor q5,q5,q1 - veor q10,q10,q9 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q4},[r1]! - vorr q0,q2,q2 - vst1.8 {q5},[r1]! - vorr q1,q3,q3 - vst1.8 {q10},[r1]! - vorr q10,q11,q11 - bhs Loop3x_cbc_dec - - cmn r2,#0x30 - beq Lcbc_done - nop - -Lcbc_dec_tail: -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q9},[r7]! 
- bgt Lcbc_dec_tail - -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - cmn r2,#0x20 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q5,q6,q7 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q9,q3,q7 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - beq Lcbc_dec_one - veor q5,q5,q1 - veor q9,q9,q10 - vorr q6,q11,q11 - vst1.8 {q5},[r1]! - vst1.8 {q9},[r1]! - b Lcbc_done - -Lcbc_dec_one: - veor q5,q5,q10 - vorr q6,q11,q11 - vst1.8 {q5},[r1]! - -Lcbc_done: - vst1.8 {q6},[r4] -Lcbc_abort: - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,pc} - -.globl _aes_hw_ctr32_encrypt_blocks -.private_extern _aes_hw_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _aes_hw_ctr32_encrypt_blocks -#endif -.align 5 -_aes_hw_ctr32_encrypt_blocks: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldr r4, [ip] @ load remaining arg - ldr r5,[r3,#240] - - ldr r8, [r4, #12] - vld1.32 {q0},[r4] - - vld1.32 {q8,q9},[r3] @ load key schedule... - sub r5,r5,#4 - mov r12,#16 - cmp r2,#2 - add r7,r3,r5,lsl#4 @ pointer to last 5 round keys - sub r5,r5,#2 - vld1.32 {q12,q13},[r7]! - vld1.32 {q14,q15},[r7]! - vld1.32 {q7},[r7] - add r7,r3,#32 - mov r6,r5 - movlo r12,#0 - - @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are - @ affected by silicon errata #1742098 [0] and #1655431 [1], - @ respectively, where the second instruction of an aese/aesmc - @ instruction pair may execute twice if an interrupt is taken right - @ after the first instruction consumes an input register of which a - @ single 32-bit lane has been updated the last time it was modified. - @ - @ This function uses a counter in one 32-bit lane. The - @ could write to q1 and q10 directly, but that trips this bugs. - @ We write to q6 and copy to the final register as a workaround. - @ - @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice - @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice -#ifndef __ARMEB__ - rev r8, r8 -#endif - add r10, r8, #1 - vorr q6,q0,q0 - rev r10, r10 - vmov.32 d13[1],r10 - add r8, r8, #2 - vorr q1,q6,q6 - bls Lctr32_tail - rev r12, r8 - vmov.32 d13[1],r12 - sub r2,r2,#3 @ bias - vorr q10,q6,q6 - b Loop3x_ctr32 - -.align 4 -Loop3x_ctr32: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.32 {q8},[r7]! 
- subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.32 {q9},[r7]! - bgt Loop3x_ctr32 - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 - vld1.8 {q2},[r0]! - add r9,r8,#1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.8 {q3},[r0]! - rev r9,r9 -.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vld1.8 {q11},[r0]! - mov r7,r3 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 -.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - veor q2,q2,q7 - add r10,r8,#2 -.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - veor q3,q3,q7 - add r8,r8,#3 -.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - @ Note the logic to update q0, q1, and q1 is written to work - @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in - @ 32-bit mode. See the comment above. - veor q11,q11,q7 - vmov.32 d13[1], r9 -.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - vorr q0,q6,q6 - rev r10,r10 -.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vmov.32 d13[1], r10 - rev r12,r8 -.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vorr q1,q6,q6 - vmov.32 d13[1], r12 -.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - vorr q10,q6,q6 - subs r2,r2,#3 -.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 -.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 -.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 - - veor q2,q2,q4 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - vst1.8 {q2},[r1]! - veor q3,q3,q5 - mov r6,r5 - vst1.8 {q3},[r1]! - veor q11,q11,q9 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q11},[r1]! - bhs Loop3x_ctr32 - - adds r2,r2,#3 - beq Lctr32_done - cmp r2,#1 - mov r12,#16 - moveq r12,#0 - -Lctr32_tail: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.32 {q9},[r7]! 
- bgt Lctr32_tail - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q2},[r0],r12 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q3},[r0] -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q2,q2,q7 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q3,q3,q7 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 -.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 - - cmp r2,#1 - veor q2,q2,q0 - veor q3,q3,q1 - vst1.8 {q2},[r1]! - beq Lctr32_done - vst1.8 {q3},[r1] - -Lctr32_done: - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} - -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-ios.ios.arm.S deleted file mode 100644 index be0d2bb9..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-ios.ios.arm.S +++ /dev/null @@ -1,950 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
- - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl _bn_mul_mont_nohw -.private_extern _bn_mul_mont_nohw -#ifdef __thumb2__ -.thumb_func _bn_mul_mont_nohw -#endif - -.align 5 -_bn_mul_mont_nohw: - ldr ip,[sp,#4] @ load num - stmdb sp!,{r0,r2} @ sp points at argument block - cmp ip,#2 - mov r0,ip @ load num -#ifdef __thumb2__ - ittt lt -#endif - movlt r0,#0 - addlt sp,sp,#2*4 - blt Labrt - - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers - - mov r0,r0,lsl#2 @ rescale r0 for byte count - sub sp,sp,r0 @ alloca(4*num) - sub sp,sp,#4 @ +extra dword - sub r0,r0,#4 @ "num=num-1" - add r4,r2,r0 @ &bp[num-1] - - add r0,sp,r0 @ r0 to point at &tp[num-1] - ldr r8,[r0,#14*4] @ &n0 - ldr r2,[r2] @ bp[0] - ldr r5,[r1],#4 @ ap[0],ap++ - ldr r6,[r3],#4 @ np[0],np++ - ldr r8,[r8] @ *n0 - str r4,[r0,#15*4] @ save &bp[num] - - umull r10,r11,r5,r2 @ ap[0]*bp[0] - str r8,[r0,#14*4] @ save n0 value - mul r8,r10,r8 @ "tp[0]"*n0 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" - mov r4,sp - -L1st: - ldr r5,[r1],#4 @ ap[j],ap++ - mov r10,r11 - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[0] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne L1st - - adds r12,r12,r11 - ldr r4,[r0,#13*4] @ restore bp - mov r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - mov r7,sp - str r14,[r0,#4] @ tp[num]= - -Louter: - sub r7,r0,r7 @ "original" r0-1 value - sub r1,r1,r7 @ "rewind" ap to &ap[1] - ldr r2,[r4,#4]! @ *(++bp) - sub r3,r3,r7 @ "rewind" np to &np[1] - ldr r5,[r1,#-4] @ ap[0] - ldr r10,[sp] @ tp[0] - ldr r6,[r3,#-4] @ np[0] - ldr r7,[sp,#4] @ tp[1] - - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] - str r4,[r0,#13*4] @ save bp - mul r8,r10,r8 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" - mov r4,sp - -Linner: - ldr r5,[r1],#4 @ ap[j],ap++ - adds r10,r11,r7 @ +=tp[j] - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[i] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adc r11,r11,#0 - ldr r7,[r4,#8] @ tp[j+1] - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne Linner - - adds r12,r12,r11 - mov r14,#0 - ldr r4,[r0,#13*4] @ restore bp - adc r14,r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adds r12,r12,r7 - ldr r7,[r0,#15*4] @ restore &bp[num] - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - - cmp r4,r7 -#ifdef __thumb2__ - itt ne -#endif - movne r7,sp - bne Louter - - ldr r2,[r0,#12*4] @ pull rp - mov r5,sp - add r0,r0,#4 @ r0 to point at &tp[num] - sub r5,r0,r5 @ "original" num value - mov r4,sp @ "rewind" r4 - mov r1,r4 @ "borrow" r1 - sub r3,r3,r5 @ "rewind" r3 to &np[0] - - subs r7,r7,r7 @ "clear" carry flag -Lsub: ldr r7,[r4],#4 - ldr r6,[r3],#4 - sbcs r7,r7,r6 @ tp[j]-np[j] - str r7,[r2],#4 @ rp[j]= - teq r4,r0 @ preserve carry - bne Lsub - sbcs r14,r14,#0 @ upmost carry - mov r4,sp @ "rewind" r4 - sub r2,r2,r5 @ "rewind" r2 - -Lcopy: ldr r7,[r4] @ conditional copy - ldr r5,[r2] - str sp,[r4],#4 @ zap tp -#ifdef __thumb2__ - it cc -#endif - movcc r5,r7 - str r5,[r2],#4 - teq r4,r0 @ preserve carry - bne Lcopy - - mov sp,r0 - add sp,sp,#4 @ skip over tp[num+1] - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -Labrt: -#if __ARM_ARCH>=5 - bx lr @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ 
interoperable with Thumb ISA:-) -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.globl _bn_mul8x_mont_neon -.private_extern _bn_mul8x_mont_neon -#ifdef __thumb2__ -.thumb_func _bn_mul8x_mont_neon -#endif -.align 5 -_bn_mul8x_mont_neon: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldmia ip,{r4,r5} @ load rest of parameter block - mov ip,sp - - cmp r5,#8 - bhi LNEON_8n - - @ special case for r5==8, everything is in register bank... - - vld1.32 {d28[0]}, [r2,:32]! - veor d8,d8,d8 - sub r7,sp,r5,lsl#4 - vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( - and r7,r7,#-64 - vld1.32 {d30[0]}, [r4,:32] - mov sp,r7 @ alloca - vzip.16 d28,d8 - - vmull.u32 q6,d28,d0[0] - vmull.u32 q7,d28,d0[1] - vmull.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmull.u32 q9,d28,d1[1] - - vadd.u64 d29,d29,d12 - veor d8,d8,d8 - vmul.u32 d29,d29,d30 - - vmull.u32 q10,d28,d2[0] - vld1.32 {d4,d5,d6,d7}, [r3]! - vmull.u32 q11,d28,d2[1] - vmull.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmull.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - sub r9,r5,#1 - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - b LNEON_outer8 - -.align 4 -LNEON_outer8: - vld1.32 {d28[0]}, [r2,:32]! - veor d8,d8,d8 - vzip.16 d28,d8 - vadd.u64 d12,d12,d10 - - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - vmlal.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmlal.u32 q9,d28,d1[1] - - vadd.u64 d29,d29,d12 - veor d8,d8,d8 - subs r9,r9,#1 - vmul.u32 d29,d29,d30 - - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - bne LNEON_outer8 - - vadd.u64 d12,d12,d10 - mov r7,sp - vshr.u64 d10,d12,#16 - mov r8,r5 - vadd.u64 d13,d13,d10 - add r6,sp,#96 - vshr.u64 d10,d13,#16 - vzip.16 d12,d13 - - b LNEON_tail_entry - -.align 4 -LNEON_8n: - veor q6,q6,q6 - sub r7,sp,#128 - veor q7,q7,q7 - sub r7,r7,r5,lsl#4 - veor q8,q8,q8 - and r7,r7,#-64 - veor q9,q9,q9 - mov sp,r7 @ alloca - veor q10,q10,q10 - add r7,r7,#256 - veor q11,q11,q11 - sub r8,r5,#8 - veor q12,q12,q12 - veor q13,q13,q13 - -LNEON_8n_init: - vst1.64 {q6,q7},[r7,:256]! - subs r8,r8,#8 - vst1.64 {q8,q9},[r7,:256]! - vst1.64 {q10,q11},[r7,:256]! - vst1.64 {q12,q13},[r7,:256]! - bne LNEON_8n_init - - add r6,sp,#256 - vld1.32 {d0,d1,d2,d3},[r1]! - add r10,sp,#8 - vld1.32 {d30[0]},[r4,:32] - mov r9,r5 - b LNEON_8n_outer - -.align 4 -LNEON_8n_outer: - vld1.32 {d28[0]},[r2,:32]! @ *b++ - veor d8,d8,d8 - vzip.16 d28,d8 - add r7,sp,#128 - vld1.32 {d4,d5,d6,d7},[r3]! 
- - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmlal.u32 q9,d28,d1[1] - vadd.u64 d29,d29,d12 - vmlal.u32 q10,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q11,d28,d2[1] - vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q6,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q7,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q8,d29,d5[0] - vshr.u64 d12,d12,#16 - vmlal.u32 q9,d29,d5[1] - vmlal.u32 q10,d29,d6[0] - vadd.u64 d12,d12,d13 - vmlal.u32 q11,d29,d6[1] - vshr.u64 d12,d12,#16 - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vadd.u64 d14,d14,d12 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] - vmlal.u32 q7,d28,d0[0] - vld1.64 {q6},[r6,:128]! - vmlal.u32 q8,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q9,d28,d1[0] - vshl.i64 d29,d15,#16 - vmlal.u32 q10,d28,d1[1] - vadd.u64 d29,d29,d14 - vmlal.u32 q11,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q12,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] - vmlal.u32 q13,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q6,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q7,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q8,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q9,d29,d5[0] - vshr.u64 d14,d14,#16 - vmlal.u32 q10,d29,d5[1] - vmlal.u32 q11,d29,d6[0] - vadd.u64 d14,d14,d15 - vmlal.u32 q12,d29,d6[1] - vshr.u64 d14,d14,#16 - vmlal.u32 q13,d29,d7[0] - vmlal.u32 q6,d29,d7[1] - vadd.u64 d16,d16,d14 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] - vmlal.u32 q8,d28,d0[0] - vld1.64 {q7},[r6,:128]! - vmlal.u32 q9,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q10,d28,d1[0] - vshl.i64 d29,d17,#16 - vmlal.u32 q11,d28,d1[1] - vadd.u64 d29,d29,d16 - vmlal.u32 q12,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q13,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2] - vmlal.u32 q6,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q7,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q8,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q9,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q10,d29,d5[0] - vshr.u64 d16,d16,#16 - vmlal.u32 q11,d29,d5[1] - vmlal.u32 q12,d29,d6[0] - vadd.u64 d16,d16,d17 - vmlal.u32 q13,d29,d6[1] - vshr.u64 d16,d16,#16 - vmlal.u32 q6,d29,d7[0] - vmlal.u32 q7,d29,d7[1] - vadd.u64 d18,d18,d16 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] - vmlal.u32 q9,d28,d0[0] - vld1.64 {q8},[r6,:128]! - vmlal.u32 q10,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q11,d28,d1[0] - vshl.i64 d29,d19,#16 - vmlal.u32 q12,d28,d1[1] - vadd.u64 d29,d29,d18 - vmlal.u32 q13,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q6,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] - vmlal.u32 q7,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q8,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q9,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q10,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q11,d29,d5[0] - vshr.u64 d18,d18,#16 - vmlal.u32 q12,d29,d5[1] - vmlal.u32 q13,d29,d6[0] - vadd.u64 d18,d18,d19 - vmlal.u32 q6,d29,d6[1] - vshr.u64 d18,d18,#16 - vmlal.u32 q7,d29,d7[0] - vmlal.u32 q8,d29,d7[1] - vadd.u64 d20,d20,d18 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] - vmlal.u32 q10,d28,d0[0] - vld1.64 {q9},[r6,:128]! - vmlal.u32 q11,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q12,d28,d1[0] - vshl.i64 d29,d21,#16 - vmlal.u32 q13,d28,d1[1] - vadd.u64 d29,d29,d20 - vmlal.u32 q6,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q7,d28,d2[1] - vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+4] - vmlal.u32 q8,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q9,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q10,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q11,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q12,d29,d5[0] - vshr.u64 d20,d20,#16 - vmlal.u32 q13,d29,d5[1] - vmlal.u32 q6,d29,d6[0] - vadd.u64 d20,d20,d21 - vmlal.u32 q7,d29,d6[1] - vshr.u64 d20,d20,#16 - vmlal.u32 q8,d29,d7[0] - vmlal.u32 q9,d29,d7[1] - vadd.u64 d22,d22,d20 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] - vmlal.u32 q11,d28,d0[0] - vld1.64 {q10},[r6,:128]! - vmlal.u32 q12,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q13,d28,d1[0] - vshl.i64 d29,d23,#16 - vmlal.u32 q6,d28,d1[1] - vadd.u64 d29,d29,d22 - vmlal.u32 q7,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q8,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] - vmlal.u32 q9,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q10,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q11,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q12,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q13,d29,d5[0] - vshr.u64 d22,d22,#16 - vmlal.u32 q6,d29,d5[1] - vmlal.u32 q7,d29,d6[0] - vadd.u64 d22,d22,d23 - vmlal.u32 q8,d29,d6[1] - vshr.u64 d22,d22,#16 - vmlal.u32 q9,d29,d7[0] - vmlal.u32 q10,d29,d7[1] - vadd.u64 d24,d24,d22 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] - vmlal.u32 q12,d28,d0[0] - vld1.64 {q11},[r6,:128]! - vmlal.u32 q13,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q6,d28,d1[0] - vshl.i64 d29,d25,#16 - vmlal.u32 q7,d28,d1[1] - vadd.u64 d29,d29,d24 - vmlal.u32 q8,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q9,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] - vmlal.u32 q10,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q11,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q12,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q13,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q6,d29,d5[0] - vshr.u64 d24,d24,#16 - vmlal.u32 q7,d29,d5[1] - vmlal.u32 q8,d29,d6[0] - vadd.u64 d24,d24,d25 - vmlal.u32 q9,d29,d6[1] - vshr.u64 d24,d24,#16 - vmlal.u32 q10,d29,d7[0] - vmlal.u32 q11,d29,d7[1] - vadd.u64 d26,d26,d24 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] - vmlal.u32 q13,d28,d0[0] - vld1.64 {q12},[r6,:128]! - vmlal.u32 q6,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q7,d28,d1[0] - vshl.i64 d29,d27,#16 - vmlal.u32 q8,d28,d1[1] - vadd.u64 d29,d29,d26 - vmlal.u32 q9,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q10,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] - vmlal.u32 q11,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q12,d28,d3[1] - vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] - vmlal.u32 q13,d29,d4[0] - vld1.32 {d0,d1,d2,d3},[r1]! - vmlal.u32 q6,d29,d4[1] - vmlal.u32 q7,d29,d5[0] - vshr.u64 d26,d26,#16 - vmlal.u32 q8,d29,d5[1] - vmlal.u32 q9,d29,d6[0] - vadd.u64 d26,d26,d27 - vmlal.u32 q10,d29,d6[1] - vshr.u64 d26,d26,#16 - vmlal.u32 q11,d29,d7[0] - vmlal.u32 q12,d29,d7[1] - vadd.u64 d12,d12,d26 - vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] - add r10,sp,#8 @ rewind - sub r8,r5,#8 - b LNEON_8n_inner - -.align 4 -LNEON_8n_inner: - subs r8,r8,#8 - vmlal.u32 q6,d28,d0[0] - vld1.64 {q13},[r6,:128] - vmlal.u32 q7,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] - vmlal.u32 q8,d28,d1[0] - vld1.32 {d4,d5,d6,d7},[r3]! - vmlal.u32 q9,d28,d1[1] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vmlal.u32 q13,d28,d3[1] - vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+1] - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - vmlal.u32 q10,d29,d6[0] - vmlal.u32 q11,d29,d6[1] - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vst1.64 {q6},[r7,:128]! - vmlal.u32 q7,d28,d0[0] - vld1.64 {q6},[r6,:128] - vmlal.u32 q8,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] - vmlal.u32 q9,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q10,d28,d1[1] - vmlal.u32 q11,d28,d2[0] - vmlal.u32 q12,d28,d2[1] - vmlal.u32 q13,d28,d3[0] - vmlal.u32 q6,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] - vmlal.u32 q7,d29,d4[0] - vmlal.u32 q8,d29,d4[1] - vmlal.u32 q9,d29,d5[0] - vmlal.u32 q10,d29,d5[1] - vmlal.u32 q11,d29,d6[0] - vmlal.u32 q12,d29,d6[1] - vmlal.u32 q13,d29,d7[0] - vmlal.u32 q6,d29,d7[1] - vst1.64 {q7},[r7,:128]! - vmlal.u32 q8,d28,d0[0] - vld1.64 {q7},[r6,:128] - vmlal.u32 q9,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] - vmlal.u32 q10,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q11,d28,d1[1] - vmlal.u32 q12,d28,d2[0] - vmlal.u32 q13,d28,d2[1] - vmlal.u32 q6,d28,d3[0] - vmlal.u32 q7,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] - vmlal.u32 q8,d29,d4[0] - vmlal.u32 q9,d29,d4[1] - vmlal.u32 q10,d29,d5[0] - vmlal.u32 q11,d29,d5[1] - vmlal.u32 q12,d29,d6[0] - vmlal.u32 q13,d29,d6[1] - vmlal.u32 q6,d29,d7[0] - vmlal.u32 q7,d29,d7[1] - vst1.64 {q8},[r7,:128]! - vmlal.u32 q9,d28,d0[0] - vld1.64 {q8},[r6,:128] - vmlal.u32 q10,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] - vmlal.u32 q11,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q12,d28,d1[1] - vmlal.u32 q13,d28,d2[0] - vmlal.u32 q6,d28,d2[1] - vmlal.u32 q7,d28,d3[0] - vmlal.u32 q8,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] - vmlal.u32 q9,d29,d4[0] - vmlal.u32 q10,d29,d4[1] - vmlal.u32 q11,d29,d5[0] - vmlal.u32 q12,d29,d5[1] - vmlal.u32 q13,d29,d6[0] - vmlal.u32 q6,d29,d6[1] - vmlal.u32 q7,d29,d7[0] - vmlal.u32 q8,d29,d7[1] - vst1.64 {q9},[r7,:128]! - vmlal.u32 q10,d28,d0[0] - vld1.64 {q9},[r6,:128] - vmlal.u32 q11,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] - vmlal.u32 q12,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q13,d28,d1[1] - vmlal.u32 q6,d28,d2[0] - vmlal.u32 q7,d28,d2[1] - vmlal.u32 q8,d28,d3[0] - vmlal.u32 q9,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] - vmlal.u32 q10,d29,d4[0] - vmlal.u32 q11,d29,d4[1] - vmlal.u32 q12,d29,d5[0] - vmlal.u32 q13,d29,d5[1] - vmlal.u32 q6,d29,d6[0] - vmlal.u32 q7,d29,d6[1] - vmlal.u32 q8,d29,d7[0] - vmlal.u32 q9,d29,d7[1] - vst1.64 {q10},[r7,:128]! - vmlal.u32 q11,d28,d0[0] - vld1.64 {q10},[r6,:128] - vmlal.u32 q12,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] - vmlal.u32 q13,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q6,d28,d1[1] - vmlal.u32 q7,d28,d2[0] - vmlal.u32 q8,d28,d2[1] - vmlal.u32 q9,d28,d3[0] - vmlal.u32 q10,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] - vmlal.u32 q11,d29,d4[0] - vmlal.u32 q12,d29,d4[1] - vmlal.u32 q13,d29,d5[0] - vmlal.u32 q6,d29,d5[1] - vmlal.u32 q7,d29,d6[0] - vmlal.u32 q8,d29,d6[1] - vmlal.u32 q9,d29,d7[0] - vmlal.u32 q10,d29,d7[1] - vst1.64 {q11},[r7,:128]! - vmlal.u32 q12,d28,d0[0] - vld1.64 {q11},[r6,:128] - vmlal.u32 q13,d28,d0[1] - vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+6] - vmlal.u32 q6,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q7,d28,d1[1] - vmlal.u32 q8,d28,d2[0] - vmlal.u32 q9,d28,d2[1] - vmlal.u32 q10,d28,d3[0] - vmlal.u32 q11,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] - vmlal.u32 q12,d29,d4[0] - vmlal.u32 q13,d29,d4[1] - vmlal.u32 q6,d29,d5[0] - vmlal.u32 q7,d29,d5[1] - vmlal.u32 q8,d29,d6[0] - vmlal.u32 q9,d29,d6[1] - vmlal.u32 q10,d29,d7[0] - vmlal.u32 q11,d29,d7[1] - vst1.64 {q12},[r7,:128]! - vmlal.u32 q13,d28,d0[0] - vld1.64 {q12},[r6,:128] - vmlal.u32 q6,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] - vmlal.u32 q7,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q8,d28,d1[1] - vmlal.u32 q9,d28,d2[0] - vmlal.u32 q10,d28,d2[1] - vmlal.u32 q11,d28,d3[0] - vmlal.u32 q12,d28,d3[1] - it eq - subeq r1,r1,r5,lsl#2 @ rewind - vmlal.u32 q13,d29,d4[0] - vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] - vmlal.u32 q6,d29,d4[1] - vld1.32 {d0,d1,d2,d3},[r1]! - vmlal.u32 q7,d29,d5[0] - add r10,sp,#8 @ rewind - vmlal.u32 q8,d29,d5[1] - vmlal.u32 q9,d29,d6[0] - vmlal.u32 q10,d29,d6[1] - vmlal.u32 q11,d29,d7[0] - vst1.64 {q13},[r7,:128]! - vmlal.u32 q12,d29,d7[1] - - bne LNEON_8n_inner - add r6,sp,#128 - vst1.64 {q6,q7},[r7,:256]! - veor q2,q2,q2 @ d4-d5 - vst1.64 {q8,q9},[r7,:256]! - veor q3,q3,q3 @ d6-d7 - vst1.64 {q10,q11},[r7,:256]! - vst1.64 {q12},[r7,:128] - - subs r9,r9,#8 - vld1.64 {q6,q7},[r6,:256]! - vld1.64 {q8,q9},[r6,:256]! - vld1.64 {q10,q11},[r6,:256]! - vld1.64 {q12,q13},[r6,:256]! - - itt ne - subne r3,r3,r5,lsl#2 @ rewind - bne LNEON_8n_outer - - add r7,sp,#128 - vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame - vshr.u64 d10,d12,#16 - vst1.64 {q2,q3},[sp,:256]! - vadd.u64 d13,d13,d10 - vst1.64 {q2,q3}, [sp,:256]! - vshr.u64 d10,d13,#16 - vst1.64 {q2,q3}, [sp,:256]! - vzip.16 d12,d13 - - mov r8,r5 - b LNEON_tail_entry - -.align 4 -LNEON_tail: - vadd.u64 d12,d12,d10 - vshr.u64 d10,d12,#16 - vld1.64 {q8,q9}, [r6, :256]! - vadd.u64 d13,d13,d10 - vld1.64 {q10,q11}, [r6, :256]! - vshr.u64 d10,d13,#16 - vld1.64 {q12,q13}, [r6, :256]! - vzip.16 d12,d13 - -LNEON_tail_entry: - vadd.u64 d14,d14,d10 - vst1.32 {d12[0]}, [r7, :32]! - vshr.u64 d10,d14,#16 - vadd.u64 d15,d15,d10 - vshr.u64 d10,d15,#16 - vzip.16 d14,d15 - vadd.u64 d16,d16,d10 - vst1.32 {d14[0]}, [r7, :32]! - vshr.u64 d10,d16,#16 - vadd.u64 d17,d17,d10 - vshr.u64 d10,d17,#16 - vzip.16 d16,d17 - vadd.u64 d18,d18,d10 - vst1.32 {d16[0]}, [r7, :32]! - vshr.u64 d10,d18,#16 - vadd.u64 d19,d19,d10 - vshr.u64 d10,d19,#16 - vzip.16 d18,d19 - vadd.u64 d20,d20,d10 - vst1.32 {d18[0]}, [r7, :32]! - vshr.u64 d10,d20,#16 - vadd.u64 d21,d21,d10 - vshr.u64 d10,d21,#16 - vzip.16 d20,d21 - vadd.u64 d22,d22,d10 - vst1.32 {d20[0]}, [r7, :32]! - vshr.u64 d10,d22,#16 - vadd.u64 d23,d23,d10 - vshr.u64 d10,d23,#16 - vzip.16 d22,d23 - vadd.u64 d24,d24,d10 - vst1.32 {d22[0]}, [r7, :32]! - vshr.u64 d10,d24,#16 - vadd.u64 d25,d25,d10 - vshr.u64 d10,d25,#16 - vzip.16 d24,d25 - vadd.u64 d26,d26,d10 - vst1.32 {d24[0]}, [r7, :32]! - vshr.u64 d10,d26,#16 - vadd.u64 d27,d27,d10 - vshr.u64 d10,d27,#16 - vzip.16 d26,d27 - vld1.64 {q6,q7}, [r6, :256]! - subs r8,r8,#8 - vst1.32 {d26[0]}, [r7, :32]! 
- bne LNEON_tail - - vst1.32 {d10[0]}, [r7, :32] @ top-most bit - sub r3,r3,r5,lsl#2 @ rewind r3 - subs r1,sp,#0 @ clear carry flag - add r2,sp,r5,lsl#2 - -LNEON_sub: - ldmia r1!, {r4,r5,r6,r7} - ldmia r3!, {r8,r9,r10,r11} - sbcs r8, r4,r8 - sbcs r9, r5,r9 - sbcs r10,r6,r10 - sbcs r11,r7,r11 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne LNEON_sub - - ldr r10, [r1] @ load top-most bit - mov r11,sp - veor q0,q0,q0 - sub r11,r2,r11 @ this is num*4 - veor q1,q1,q1 - mov r1,sp - sub r0,r0,r11 @ rewind r0 - mov r3,r2 @ second 3/4th of frame - sbcs r10,r10,#0 @ result is carry flag - -LNEON_copy_n_zap: - ldmia r1!, {r4,r5,r6,r7} - ldmia r0, {r8,r9,r10,r11} - it cc - movcc r8, r4 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - itt cc - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - it cc - movcc r11,r7 - ldmia r1, {r4,r5,r6,r7} - stmia r0!, {r8,r9,r10,r11} - sub r1,r1,#16 - ldmia r0, {r8,r9,r10,r11} - it cc - movcc r8, r4 - vst1.64 {q0,q1}, [r1,:256]! @ wipe - itt cc - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - it cc - movcc r11,r7 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne LNEON_copy_n_zap - - mov sp,ip - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - bx lr @ bx lr - -#endif -.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm.c new file mode 100644 index 00000000..7cc13342 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm.c @@ -0,0 +1,277 @@ +/* Copyright (c) 2017, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#if !defined(_GNU_SOURCE) +#define _GNU_SOURCE // needed for syscall() on Linux. +#endif + +#include + +#include +#if defined(BORINGSSL_FIPS) +#include +#include +#endif + +#include +#include +#include + +#include "bcm_interface.h" +#include "../internal.h" + +// TODO(crbug.com/362530616): When delocate is removed, build these files as +// separate compilation units again. 
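The TODO above is the key to this new file's layout: instead of compiling each FIPS-module source separately, bcm.c pulls them in textually (the sources are renamed from .c to .c.inc later in this diff), so the whole module is emitted as a single translation unit that delocate and the integrity check below can treat as one contiguous region. A minimal sketch of the pattern, with hypothetical file names:

  /* module.c -- hypothetical aggregate translation unit */
  #include "part_a.c.inc"  /* formerly part_a.c, no longer compiled on its own */
  #include "part_b.c.inc"  /* formerly part_b.c */
  /* Everything from part_a and part_b now lands in the single object module.o. */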
+#include "aes/aes.c.inc" +#include "aes/aes_nohw.c.inc" +#include "aes/key_wrap.c.inc" +#include "aes/mode_wrappers.c.inc" +#include "bn/add.c.inc" +#include "bn/asm/x86_64-gcc.c.inc" +#include "bn/bn.c.inc" +#include "bn/bytes.c.inc" +#include "bn/cmp.c.inc" +#include "bn/ctx.c.inc" +#include "bn/div.c.inc" +#include "bn/div_extra.c.inc" +#include "bn/exponentiation.c.inc" +#include "bn/gcd.c.inc" +#include "bn/gcd_extra.c.inc" +#include "bn/generic.c.inc" +#include "bn/jacobi.c.inc" +#include "bn/montgomery.c.inc" +#include "bn/montgomery_inv.c.inc" +#include "bn/mul.c.inc" +#include "bn/prime.c.inc" +#include "bn/random.c.inc" +#include "bn/rsaz_exp.c.inc" +#include "bn/shift.c.inc" +#include "bn/sqrt.c.inc" +#include "cipher/aead.c.inc" +#include "cipher/cipher.c.inc" +#include "cipher/e_aes.c.inc" +#include "cipher/e_aesccm.c.inc" +#include "cmac/cmac.c.inc" +#include "dh/check.c.inc" +#include "dh/dh.c.inc" +#include "digest/digest.c.inc" +#include "digest/digests.c.inc" +#include "digestsign/digestsign.c.inc" +#include "ecdh/ecdh.c.inc" +#include "ecdsa/ecdsa.c.inc" +#include "ec/ec.c.inc" +#include "ec/ec_key.c.inc" +#include "ec/ec_montgomery.c.inc" +#include "ec/felem.c.inc" +#include "ec/oct.c.inc" +#include "ec/p224-64.c.inc" +#include "ec/p256.c.inc" +#include "ec/p256-nistz.c.inc" +#include "ec/scalar.c.inc" +#include "ec/simple.c.inc" +#include "ec/simple_mul.c.inc" +#include "ec/util.c.inc" +#include "ec/wnaf.c.inc" +#include "hkdf/hkdf.c.inc" +#include "hmac/hmac.c.inc" +#include "modes/cbc.c.inc" +#include "modes/cfb.c.inc" +#include "modes/ctr.c.inc" +#include "modes/gcm.c.inc" +#include "modes/gcm_nohw.c.inc" +#include "modes/ofb.c.inc" +#include "modes/polyval.c.inc" +#include "rand/ctrdrbg.c.inc" +#include "rand/rand.c.inc" +#include "rsa/blinding.c.inc" +#include "rsa/padding.c.inc" +#include "rsa/rsa.c.inc" +#include "rsa/rsa_impl.c.inc" +#include "self_check/fips.c.inc" +#include "self_check/self_check.c.inc" +#include "service_indicator/service_indicator.c.inc" +#include "sha/sha1.c.inc" +#include "sha/sha256.c.inc" +#include "sha/sha512.c.inc" +#include "tls/kdf.c.inc" + + +#if defined(BORINGSSL_FIPS) + +#if !defined(OPENSSL_ASAN) + +// These symbols are filled in by delocate.go (in static builds) or a linker +// script (in shared builds). They point to the start and end of the module, and +// the location of the integrity hash, respectively. +extern const uint8_t BORINGSSL_bcm_text_start[]; +extern const uint8_t BORINGSSL_bcm_text_end[]; +extern const uint8_t BORINGSSL_bcm_text_hash[]; +#if defined(BORINGSSL_SHARED_LIBRARY) +extern const uint8_t BORINGSSL_bcm_rodata_start[]; +extern const uint8_t BORINGSSL_bcm_rodata_end[]; +#endif + +// assert_within is used to sanity check that certain symbols are within the +// bounds of the integrity check. It checks that start <= symbol < end and +// aborts otherwise. +static void assert_within(const void *start, const void *symbol, + const void *end) { + const uintptr_t start_val = (uintptr_t) start; + const uintptr_t symbol_val = (uintptr_t) symbol; + const uintptr_t end_val = (uintptr_t) end; + + if (start_val <= symbol_val && symbol_val < end_val) { + return; + } + + fprintf( + stderr, + "FIPS module doesn't span expected symbol. 
Expected %p <= %p < %p\n", + start, symbol, end); + BORINGSSL_FIPS_abort(); +} + +#if defined(OPENSSL_ANDROID) && defined(OPENSSL_AARCH64) +static void BORINGSSL_maybe_set_module_text_permissions(int permission) { + // Android may be compiled in execute-only-memory mode, in which case the + // .text segment cannot be read. That conflicts with the need for a FIPS + // module to hash its own contents, therefore |mprotect| is used to make + // the module's .text readable for the duration of the hashing process. In + // other build configurations this is a no-op. + const uintptr_t page_size = getpagesize(); + const uintptr_t page_start = + ((uintptr_t)BORINGSSL_bcm_text_start) & ~(page_size - 1); + + if (mprotect((void *)page_start, + ((uintptr_t)BORINGSSL_bcm_text_end) - page_start, + permission) != 0) { + perror("BoringSSL: mprotect"); + } +} +#else +static void BORINGSSL_maybe_set_module_text_permissions(int permission) {} +#endif // !ANDROID + +#endif // !ASAN + +static void __attribute__((constructor)) +BORINGSSL_bcm_power_on_self_test(void) { +#if !defined(OPENSSL_ASAN) + // Integrity tests cannot run under ASAN because it involves reading the full + // .text section, which triggers the global-buffer overflow detection. + if (!BORINGSSL_integrity_test()) { + goto err; + } +#endif // OPENSSL_ASAN + + if (!boringssl_self_test_startup()) { + goto err; + } + + return; + +err: + BORINGSSL_FIPS_abort(); +} + +#if !defined(OPENSSL_ASAN) +int BORINGSSL_integrity_test(void) { + const uint8_t *const start = BORINGSSL_bcm_text_start; + const uint8_t *const end = BORINGSSL_bcm_text_end; + + assert_within(start, AES_encrypt, end); + assert_within(start, RSA_sign, end); + assert_within(start, BCM_rand_bytes, end); + assert_within(start, EC_GROUP_cmp, end); + assert_within(start, SHA256_Update, end); + assert_within(start, ecdsa_verify_fixed, end); + assert_within(start, EVP_AEAD_CTX_seal, end); + +#if defined(BORINGSSL_SHARED_LIBRARY) + const uint8_t *const rodata_start = BORINGSSL_bcm_rodata_start; + const uint8_t *const rodata_end = BORINGSSL_bcm_rodata_end; +#else + // In the static build, read-only data is placed within the .text segment. 
+ const uint8_t *const rodata_start = BORINGSSL_bcm_text_start; + const uint8_t *const rodata_end = BORINGSSL_bcm_text_end; +#endif + + assert_within(rodata_start, kPrimes, rodata_end); + assert_within(rodata_start, kP256Field, rodata_end); + assert_within(rodata_start, kPKCS1SigPrefixes, rodata_end); + + uint8_t result[SHA256_DIGEST_LENGTH]; + const EVP_MD *const kHashFunction = EVP_sha256(); + if (!boringssl_self_test_sha256() || + !boringssl_self_test_hmac_sha256()) { + return 0; + } + + static const uint8_t kHMACKey[64] = {0}; + unsigned result_len; + HMAC_CTX hmac_ctx; + HMAC_CTX_init(&hmac_ctx); + if (!HMAC_Init_ex(&hmac_ctx, kHMACKey, sizeof(kHMACKey), kHashFunction, + NULL /* no ENGINE */)) { + fprintf(stderr, "HMAC_Init_ex failed.\n"); + return 0; + } + + BORINGSSL_maybe_set_module_text_permissions(PROT_READ | PROT_EXEC); +#if defined(BORINGSSL_SHARED_LIBRARY) + uint64_t length = end - start; + HMAC_Update(&hmac_ctx, (const uint8_t *) &length, sizeof(length)); + HMAC_Update(&hmac_ctx, start, length); + + length = rodata_end - rodata_start; + HMAC_Update(&hmac_ctx, (const uint8_t *) &length, sizeof(length)); + HMAC_Update(&hmac_ctx, rodata_start, length); +#else + HMAC_Update(&hmac_ctx, start, end - start); +#endif + BORINGSSL_maybe_set_module_text_permissions(PROT_EXEC); + + if (!HMAC_Final(&hmac_ctx, result, &result_len) || + result_len != sizeof(result)) { + fprintf(stderr, "HMAC failed.\n"); + return 0; + } + HMAC_CTX_cleanse(&hmac_ctx); // FIPS 140-3, AS05.10. + + const uint8_t *expected = BORINGSSL_bcm_text_hash; + + if (!check_test(expected, result, sizeof(result), "FIPS integrity test")) { +#if !defined(BORINGSSL_FIPS_BREAK_TESTS) + return 0; +#endif + } + + OPENSSL_cleanse(result, sizeof(result)); // FIPS 140-3, AS05.10. + return 1; +} + +const uint8_t* FIPS_module_hash(void) { + return BORINGSSL_bcm_text_hash; +} + +#endif // OPENSSL_ASAN + +void BORINGSSL_FIPS_abort(void) { + for (;;) { + abort(); + exit(1); + } +} + +#endif // BORINGSSL_FIPS diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm_interface.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm_interface.h new file mode 100644 index 00000000..b5461595 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm_interface.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2024, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_BCM_INTERFACE_H +#define OPENSSL_HEADER_CRYPTO_BCM_INTERFACE_H + +#include + +// This header will eventually become the interface between BCM and the +// rest of libcrypto. More cleanly separating the two is still a work in +// progress (see https://crbug.com/boringssl/722) so, at the moment, we +// consider this no different from any other header in BCM. 
+// +// Over time, calls from libcrypto to BCM will all move to this header +// and the separation will become more meaningful. + +#if defined(__cplusplus) +extern "C" { +#endif + +// Enumerated types for return values from bcm functions, both infallible +// and fallible functions. Two success values are used to correspond to the +// FIPS service indicator. For the moment, the official service indicator +// remains the counter, not these values. Once we fully transition to +// these return values from bcm we will change that. +enum bcm_infallible_t { + bcm_infallible_approved, + bcm_infallible_not_approved, +}; + +enum bcm_status_t { + bcm_status_approved, + bcm_status_not_approved, + + // Failure codes, which must all be negative. + bcm_status_failure, +}; +typedef enum bcm_status_t bcm_status; +typedef enum bcm_infallible_t bcm_infallible; + +OPENSSL_INLINE int bcm_success(bcm_status status) { + return status == bcm_status_approved || status == bcm_status_not_approved; +} + + +// Random number generator. + +#if defined(BORINGSSL_FIPS) + +// We overread from /dev/urandom or RDRAND by a factor of 10 and XOR to whiten. +// TODO(bbe): disentangle this value which is used to calculate the size of the +// stack buffer in RAND_need entropy based on a calculation. +#define BORINGSSL_FIPS_OVERREAD 10 + +#endif // BORINGSSL_FIPS + +// BCM_rand_load_entropy supplies |entropy_len| bytes of entropy to the BCM +// module. The |want_additional_input| parameter is true iff the entropy was +// obtained from a source other than the system, e.g. directly from the CPU. +bcm_infallible BCM_rand_load_entropy(const uint8_t *entropy, size_t entropy_len, + int want_additional_input); + +// BCM_rand_bytes is the same as the public |RAND_bytes| function, other +// than returning a bcm_infallible status indicator. +OPENSSL_EXPORT bcm_infallible BCM_rand_bytes(uint8_t *out, size_t out_len); + +// BCM_rand_bytes_hwrng attempts to fill |out| with |len| bytes of entropy from +// the CPU hardware random number generator if one is present. +// bcm_status_approved is returned on success, and a failure status is +// returned otherwise. +bcm_status BCM_rand_bytes_hwrng(uint8_t *out, size_t len); + +// BCM_rand_bytes_with_additional_data samples from the RNG after mixing 32 +// bytes from |user_additional_data| in. +bcm_infallible BCM_rand_bytes_with_additional_data( + uint8_t *out, size_t out_len, const uint8_t user_additional_data[32]); + + +// SHA-1 + +// BCM_SHA_DIGEST_LENGTH is the length of a SHA-1 digest. +#define BCM_SHA_DIGEST_LENGTH 20 + +// BCM_sha1_init initialises |sha|. +bcm_infallible BCM_sha1_init(SHA_CTX *sha); + +// BCM_SHA1_transform is a low-level function that performs a single, SHA-1 +// block transformation using the state from |sha| and |SHA_CBLOCK| bytes from +// |block|. +bcm_infallible BCM_sha1_transform(SHA_CTX *c, const uint8_t data[BCM_SHA_CBLOCK]); + +// BCM_sha1_update adds |len| bytes from |data| to |sha|. +bcm_infallible BCM_sha1_update(SHA_CTX *c, const void *data, size_t len); + +// BCM_sha1_final adds the final padding to |sha| and writes the resulting +// digest to |out|, which must have at least |SHA_DIGEST_LENGTH| bytes of space. +bcm_infallible BCM_sha1_final(uint8_t out[BCM_SHA_DIGEST_LENGTH], SHA_CTX *c); + + +// BCM_fips_186_2_prf derives |out_len| bytes from |xkey| using the PRF +// defined in FIPS 186-2, Appendix 3.1, with change notice 1 applied. The b +// parameter is 160 and seed, XKEY, is also 160 bits. The optional XSEED user +// input is all zeros. 
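The header does not show how callers are expected to consume these results, so here is a minimal sketch, assuming only the declarations above: a fallible call is checked with bcm_success, while an infallible call's return value may be ignored.

  static void fill_seed_sketch(uint8_t seed[16]) {
    if (!bcm_success(BCM_rand_bytes_hwrng(seed, 16))) {
      /* No hardware RNG, or it failed. BCM_rand_bytes cannot fail; its
         bcm_infallible result only reports FIPS approval status. */
      BCM_rand_bytes(seed, 16);
    }
  }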
+// +// The PRF generates a sequence of 320-bit numbers. Each number is encoded as a +// 40-byte string in big-endian and then concatenated to form |out|. If +// |out_len| is not a multiple of 40, the result is truncated. This matches the +// construction used in Section 7 of RFC 4186 and Section 7 of RFC 4187. +// +// This PRF is based on SHA-1, a weak hash function, and should not be used +// in new protocols. It is provided for compatibility with some legacy EAP +// methods. +bcm_infallible BCM_fips_186_2_prf(uint8_t *out, size_t out_len, + const uint8_t xkey[BCM_SHA_DIGEST_LENGTH]); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_CRYPTO_BCM_INTERFACE_H diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-windows.windows.x86.S deleted file mode 100644 index d700276e..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-windows.windows.x86.S +++ /dev/null @@ -1,989 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. - -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -global _bn_mul_add_words -align 16 -_bn_mul_add_words: -L$_bn_mul_add_words_begin: - lea eax,[_OPENSSL_ia32cap_P] - bt DWORD [eax],26 - jnc NEAR L$000maw_non_sse2 - mov eax,DWORD [4+esp] - mov edx,DWORD [8+esp] - mov ecx,DWORD [12+esp] - movd mm0,DWORD [16+esp] - pxor mm1,mm1 - jmp NEAR L$001maw_sse2_entry -align 16 -L$002maw_sse2_unrolled: - movd mm3,DWORD [eax] - paddq mm1,mm3 - movd mm2,DWORD [edx] - pmuludq mm2,mm0 - movd mm4,DWORD [4+edx] - pmuludq mm4,mm0 - movd mm6,DWORD [8+edx] - pmuludq mm6,mm0 - movd mm7,DWORD [12+edx] - pmuludq mm7,mm0 - paddq mm1,mm2 - movd mm3,DWORD [4+eax] - paddq mm3,mm4 - movd mm5,DWORD [8+eax] - paddq mm5,mm6 - movd mm4,DWORD [12+eax] - paddq mm7,mm4 - movd DWORD [eax],mm1 - movd mm2,DWORD [16+edx] - pmuludq mm2,mm0 - psrlq mm1,32 - movd mm4,DWORD [20+edx] - pmuludq mm4,mm0 - paddq mm1,mm3 - movd mm6,DWORD [24+edx] - pmuludq mm6,mm0 - movd DWORD [4+eax],mm1 - psrlq mm1,32 - movd mm3,DWORD [28+edx] - add edx,32 - pmuludq mm3,mm0 - paddq mm1,mm5 - movd mm5,DWORD [16+eax] - paddq mm2,mm5 - movd DWORD [8+eax],mm1 - psrlq mm1,32 - paddq mm1,mm7 - movd mm5,DWORD [20+eax] - paddq mm4,mm5 - movd DWORD [12+eax],mm1 - psrlq mm1,32 - paddq mm1,mm2 - movd mm5,DWORD [24+eax] - paddq mm6,mm5 - movd DWORD [16+eax],mm1 - psrlq mm1,32 - paddq mm1,mm4 - movd mm5,DWORD [28+eax] - paddq mm3,mm5 - movd DWORD [20+eax],mm1 - psrlq mm1,32 - paddq mm1,mm6 - movd DWORD [24+eax],mm1 - psrlq mm1,32 - paddq mm1,mm3 - movd DWORD [28+eax],mm1 - lea eax,[32+eax] - psrlq mm1,32 - sub ecx,8 - jz NEAR L$003maw_sse2_exit -L$001maw_sse2_entry: - test ecx,4294967288 - jnz NEAR L$002maw_sse2_unrolled -align 4 -L$004maw_sse2_loop: - movd mm2,DWORD [edx] - movd mm3,DWORD [eax] - pmuludq mm2,mm0 - lea edx,[4+edx] - paddq mm1,mm3 - paddq mm1,mm2 - movd DWORD [eax],mm1 - sub ecx,1 - psrlq mm1,32 - lea eax,[4+eax] - jnz NEAR L$004maw_sse2_loop -L$003maw_sse2_exit: - movd eax,mm1 - emms - ret -align 16 -L$000maw_non_sse2: - push ebp - push ebx - push esi - push edi - ; - xor esi,esi - 
mov edi,DWORD [20+esp] - mov ecx,DWORD [28+esp] - mov ebx,DWORD [24+esp] - and ecx,4294967288 - mov ebp,DWORD [32+esp] - push ecx - jz NEAR L$005maw_finish -align 16 -L$006maw_loop: - ; Round 0 - mov eax,DWORD [ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [edi] - adc edx,0 - mov DWORD [edi],eax - mov esi,edx - ; Round 4 - mov eax,DWORD [4+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [4+edi] - adc edx,0 - mov DWORD [4+edi],eax - mov esi,edx - ; Round 8 - mov eax,DWORD [8+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [8+edi] - adc edx,0 - mov DWORD [8+edi],eax - mov esi,edx - ; Round 12 - mov eax,DWORD [12+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [12+edi] - adc edx,0 - mov DWORD [12+edi],eax - mov esi,edx - ; Round 16 - mov eax,DWORD [16+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [16+edi] - adc edx,0 - mov DWORD [16+edi],eax - mov esi,edx - ; Round 20 - mov eax,DWORD [20+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [20+edi] - adc edx,0 - mov DWORD [20+edi],eax - mov esi,edx - ; Round 24 - mov eax,DWORD [24+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [24+edi] - adc edx,0 - mov DWORD [24+edi],eax - mov esi,edx - ; Round 28 - mov eax,DWORD [28+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [28+edi] - adc edx,0 - mov DWORD [28+edi],eax - mov esi,edx - ; - sub ecx,8 - lea ebx,[32+ebx] - lea edi,[32+edi] - jnz NEAR L$006maw_loop -L$005maw_finish: - mov ecx,DWORD [32+esp] - and ecx,7 - jnz NEAR L$007maw_finish2 - jmp NEAR L$008maw_end -L$007maw_finish2: - ; Tail Round 0 - mov eax,DWORD [ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [edi] - adc edx,0 - dec ecx - mov DWORD [edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 1 - mov eax,DWORD [4+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [4+edi] - adc edx,0 - dec ecx - mov DWORD [4+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 2 - mov eax,DWORD [8+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [8+edi] - adc edx,0 - dec ecx - mov DWORD [8+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 3 - mov eax,DWORD [12+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [12+edi] - adc edx,0 - dec ecx - mov DWORD [12+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 4 - mov eax,DWORD [16+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [16+edi] - adc edx,0 - dec ecx - mov DWORD [16+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 5 - mov eax,DWORD [20+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [20+edi] - adc edx,0 - dec ecx - mov DWORD [20+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 6 - mov eax,DWORD [24+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [24+edi] - adc edx,0 - mov DWORD [24+edi],eax - mov esi,edx -L$008maw_end: - mov eax,esi - pop ecx - pop edi - pop esi - pop ebx - pop ebp - ret -global _bn_mul_words -align 16 -_bn_mul_words: -L$_bn_mul_words_begin: - lea eax,[_OPENSSL_ia32cap_P] - bt DWORD [eax],26 - jnc NEAR L$009mw_non_sse2 - mov eax,DWORD [4+esp] - mov edx,DWORD [8+esp] - mov ecx,DWORD [12+esp] - movd mm0,DWORD [16+esp] - pxor mm1,mm1 -align 16 -L$010mw_sse2_loop: - movd mm2,DWORD [edx] - pmuludq mm2,mm0 - lea edx,[4+edx] - paddq mm1,mm2 - movd DWORD [eax],mm1 - sub ecx,1 - psrlq mm1,32 - lea eax,[4+eax] - jnz NEAR L$010mw_sse2_loop - movd eax,mm1 - emms - ret -align 16 -L$009mw_non_sse2: - push ebp - push ebx - push esi - push edi - ; - xor esi,esi - mov edi,DWORD [20+esp] - mov ebx,DWORD [24+esp] - 
mov ebp,DWORD [28+esp] - mov ecx,DWORD [32+esp] - and ebp,4294967288 - jz NEAR L$011mw_finish -L$012mw_loop: - ; Round 0 - mov eax,DWORD [ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [edi],eax - mov esi,edx - ; Round 4 - mov eax,DWORD [4+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [4+edi],eax - mov esi,edx - ; Round 8 - mov eax,DWORD [8+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [8+edi],eax - mov esi,edx - ; Round 12 - mov eax,DWORD [12+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [12+edi],eax - mov esi,edx - ; Round 16 - mov eax,DWORD [16+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [16+edi],eax - mov esi,edx - ; Round 20 - mov eax,DWORD [20+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [20+edi],eax - mov esi,edx - ; Round 24 - mov eax,DWORD [24+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [24+edi],eax - mov esi,edx - ; Round 28 - mov eax,DWORD [28+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [28+edi],eax - mov esi,edx - ; - add ebx,32 - add edi,32 - sub ebp,8 - jz NEAR L$011mw_finish - jmp NEAR L$012mw_loop -L$011mw_finish: - mov ebp,DWORD [28+esp] - and ebp,7 - jnz NEAR L$013mw_finish2 - jmp NEAR L$014mw_end -L$013mw_finish2: - ; Tail Round 0 - mov eax,DWORD [ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 1 - mov eax,DWORD [4+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [4+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 2 - mov eax,DWORD [8+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [8+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 3 - mov eax,DWORD [12+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [12+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 4 - mov eax,DWORD [16+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [16+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 5 - mov eax,DWORD [20+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [20+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 6 - mov eax,DWORD [24+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [24+edi],eax - mov esi,edx -L$014mw_end: - mov eax,esi - pop edi - pop esi - pop ebx - pop ebp - ret -global _bn_sqr_words -align 16 -_bn_sqr_words: -L$_bn_sqr_words_begin: - lea eax,[_OPENSSL_ia32cap_P] - bt DWORD [eax],26 - jnc NEAR L$015sqr_non_sse2 - mov eax,DWORD [4+esp] - mov edx,DWORD [8+esp] - mov ecx,DWORD [12+esp] -align 16 -L$016sqr_sse2_loop: - movd mm0,DWORD [edx] - pmuludq mm0,mm0 - lea edx,[4+edx] - movq [eax],mm0 - sub ecx,1 - lea eax,[8+eax] - jnz NEAR L$016sqr_sse2_loop - emms - ret -align 16 -L$015sqr_non_sse2: - push ebp - push ebx - push esi - push edi - ; - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov ebx,DWORD [28+esp] - and ebx,4294967288 - jz NEAR L$017sw_finish -L$018sw_loop: - ; Round 0 - mov eax,DWORD [edi] - mul eax - mov DWORD [esi],eax - mov DWORD [4+esi],edx - ; Round 4 - mov eax,DWORD [4+edi] - mul eax - mov DWORD [8+esi],eax - mov DWORD [12+esi],edx - ; Round 8 - mov eax,DWORD [8+edi] - mul eax - mov DWORD [16+esi],eax - mov DWORD [20+esi],edx - ; Round 12 - mov eax,DWORD [12+edi] - mul eax - mov DWORD [24+esi],eax - mov DWORD [28+esi],edx - ; Round 16 - mov eax,DWORD [16+edi] - mul eax - mov DWORD [32+esi],eax - mov DWORD [36+esi],edx - ; Round 20 - mov eax,DWORD [20+edi] - mul eax - mov DWORD [40+esi],eax - mov DWORD [44+esi],edx - ; Round 24 - mov eax,DWORD [24+edi] - mul eax - mov DWORD [48+esi],eax 
- mov DWORD [52+esi],edx - ; Round 28 - mov eax,DWORD [28+edi] - mul eax - mov DWORD [56+esi],eax - mov DWORD [60+esi],edx - ; - add edi,32 - add esi,64 - sub ebx,8 - jnz NEAR L$018sw_loop -L$017sw_finish: - mov ebx,DWORD [28+esp] - and ebx,7 - jz NEAR L$019sw_end - ; Tail Round 0 - mov eax,DWORD [edi] - mul eax - mov DWORD [esi],eax - dec ebx - mov DWORD [4+esi],edx - jz NEAR L$019sw_end - ; Tail Round 1 - mov eax,DWORD [4+edi] - mul eax - mov DWORD [8+esi],eax - dec ebx - mov DWORD [12+esi],edx - jz NEAR L$019sw_end - ; Tail Round 2 - mov eax,DWORD [8+edi] - mul eax - mov DWORD [16+esi],eax - dec ebx - mov DWORD [20+esi],edx - jz NEAR L$019sw_end - ; Tail Round 3 - mov eax,DWORD [12+edi] - mul eax - mov DWORD [24+esi],eax - dec ebx - mov DWORD [28+esi],edx - jz NEAR L$019sw_end - ; Tail Round 4 - mov eax,DWORD [16+edi] - mul eax - mov DWORD [32+esi],eax - dec ebx - mov DWORD [36+esi],edx - jz NEAR L$019sw_end - ; Tail Round 5 - mov eax,DWORD [20+edi] - mul eax - mov DWORD [40+esi],eax - dec ebx - mov DWORD [44+esi],edx - jz NEAR L$019sw_end - ; Tail Round 6 - mov eax,DWORD [24+edi] - mul eax - mov DWORD [48+esi],eax - mov DWORD [52+esi],edx -L$019sw_end: - pop edi - pop esi - pop ebx - pop ebp - ret -global _bn_div_words -align 16 -_bn_div_words: -L$_bn_div_words_begin: - mov edx,DWORD [4+esp] - mov eax,DWORD [8+esp] - mov ecx,DWORD [12+esp] - div ecx - ret -global _bn_add_words -align 16 -_bn_add_words: -L$_bn_add_words_begin: - push ebp - push ebx - push esi - push edi - ; - mov ebx,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov edi,DWORD [28+esp] - mov ebp,DWORD [32+esp] - xor eax,eax - and ebp,4294967288 - jz NEAR L$020aw_finish -L$021aw_loop: - ; Round 0 - mov ecx,DWORD [esi] - mov edx,DWORD [edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [ebx],ecx - ; Round 1 - mov ecx,DWORD [4+esi] - mov edx,DWORD [4+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [4+ebx],ecx - ; Round 2 - mov ecx,DWORD [8+esi] - mov edx,DWORD [8+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [8+ebx],ecx - ; Round 3 - mov ecx,DWORD [12+esi] - mov edx,DWORD [12+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [12+ebx],ecx - ; Round 4 - mov ecx,DWORD [16+esi] - mov edx,DWORD [16+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [16+ebx],ecx - ; Round 5 - mov ecx,DWORD [20+esi] - mov edx,DWORD [20+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [20+ebx],ecx - ; Round 6 - mov ecx,DWORD [24+esi] - mov edx,DWORD [24+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [24+ebx],ecx - ; Round 7 - mov ecx,DWORD [28+esi] - mov edx,DWORD [28+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [28+ebx],ecx - ; - add esi,32 - add edi,32 - add ebx,32 - sub ebp,8 - jnz NEAR L$021aw_loop -L$020aw_finish: - mov ebp,DWORD [32+esp] - and ebp,7 - jz NEAR L$022aw_end - ; Tail Round 0 - mov ecx,DWORD [esi] - mov edx,DWORD [edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 1 - mov ecx,DWORD [4+esi] - mov edx,DWORD [4+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [4+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 2 - mov ecx,DWORD [8+esi] - mov edx,DWORD [8+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - 
mov DWORD [8+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 3 - mov ecx,DWORD [12+esi] - mov edx,DWORD [12+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [12+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 4 - mov ecx,DWORD [16+esi] - mov edx,DWORD [16+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [16+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 5 - mov ecx,DWORD [20+esi] - mov edx,DWORD [20+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [20+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 6 - mov ecx,DWORD [24+esi] - mov edx,DWORD [24+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [24+ebx],ecx -L$022aw_end: - pop edi - pop esi - pop ebx - pop ebp - ret -global _bn_sub_words -align 16 -_bn_sub_words: -L$_bn_sub_words_begin: - push ebp - push ebx - push esi - push edi - ; - mov ebx,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov edi,DWORD [28+esp] - mov ebp,DWORD [32+esp] - xor eax,eax - and ebp,4294967288 - jz NEAR L$023aw_finish -L$024aw_loop: - ; Round 0 - mov ecx,DWORD [esi] - mov edx,DWORD [edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [ebx],ecx - ; Round 1 - mov ecx,DWORD [4+esi] - mov edx,DWORD [4+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [4+ebx],ecx - ; Round 2 - mov ecx,DWORD [8+esi] - mov edx,DWORD [8+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [8+ebx],ecx - ; Round 3 - mov ecx,DWORD [12+esi] - mov edx,DWORD [12+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [12+ebx],ecx - ; Round 4 - mov ecx,DWORD [16+esi] - mov edx,DWORD [16+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [16+ebx],ecx - ; Round 5 - mov ecx,DWORD [20+esi] - mov edx,DWORD [20+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [20+ebx],ecx - ; Round 6 - mov ecx,DWORD [24+esi] - mov edx,DWORD [24+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [24+ebx],ecx - ; Round 7 - mov ecx,DWORD [28+esi] - mov edx,DWORD [28+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [28+ebx],ecx - ; - add esi,32 - add edi,32 - add ebx,32 - sub ebp,8 - jnz NEAR L$024aw_loop -L$023aw_finish: - mov ebp,DWORD [32+esp] - and ebp,7 - jz NEAR L$025aw_end - ; Tail Round 0 - mov ecx,DWORD [esi] - mov edx,DWORD [edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 1 - mov ecx,DWORD [4+esi] - mov edx,DWORD [4+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [4+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 2 - mov ecx,DWORD [8+esi] - mov edx,DWORD [8+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [8+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 3 - mov ecx,DWORD [12+esi] - mov edx,DWORD [12+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [12+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 4 - mov ecx,DWORD [16+esi] - mov edx,DWORD [16+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [16+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 5 - mov ecx,DWORD [20+esi] - mov edx,DWORD [20+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc 
eax,0 - dec ebp - mov DWORD [20+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 6 - mov ecx,DWORD [24+esi] - mov edx,DWORD [24+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [24+ebx],ecx -L$025aw_end: - pop edi - pop esi - pop ebx - pop ebp - ret -segment .bss -common _OPENSSL_ia32cap_P 16 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/add.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/add.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/add.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/add.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/asm/x86_64-gcc.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/asm/x86_64-gcc.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/asm/x86_64-gcc.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/asm/x86_64-gcc.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bn.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bn.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bn.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bn.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c.inc index e3475518..124f4418 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c.inc @@ -186,7 +186,7 @@ void bn_assert_fits_in_bytes(const BIGNUM *bn, size_t num) { void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, size_t in_len) { // The caller should have selected an output length without truncation. - assert(fits_in_bytes(in, in_len, out_len)); + declassify_assert(fits_in_bytes(in, in_len, out_len)); // We only support little-endian platforms, so the internal representation is // also little-endian as bytes. We can simply copy it in reverse. 
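This hunk, like several below in div.c, div_extra.c, exponentiation.c, gcd_extra.c, montgomery_inv.c and mul.c, replaces a plain assert with declassify_assert: the asserted value is derived from secret data but is safe to make public, so constant-time analysis tooling should not flag the branch inside the assertion. The macro itself is not part of this diff; a rough sketch of its shape, assuming the definition in BoringSSL's crypto/internal.h:

  /* Sketch only; the real definition lives in crypto/internal.h, outside this diff. */
  #define declassify_assert(expr) assert(constant_time_declassify_int(expr))

Relatedly, the gcd_extra.c hunk further below changes BN_is_zero(u) || BN_is_zero(v) to BN_is_zero(u) | BN_is_zero(v): with 0/1 operands the two are equivalent, but the bitwise form avoids the short-circuit branch on the left operand.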
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/cmp.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/cmp.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/cmp.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/cmp.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/ctx.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/ctx.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/ctx.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/ctx.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c.inc similarity index 75% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c.inc index 4233f4a9..f15843fd 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c.inc @@ -149,11 +149,11 @@ static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out, // * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65668 // // Clang bugs: - // * https://llvm.org/bugs/show_bug.cgi?id=6397 - // * https://llvm.org/bugs/show_bug.cgi?id=12418 + // * https://github.com/llvm/llvm-project/issues/6769 + // * https://github.com/llvm/llvm-project/issues/12790 // - // These issues aren't specific to x86 and x86_64, so it might be worthwhile - // to add more assembly language implementations. + // These is specific to x86 and x86_64; Arm and RISC-V do not have double-wide + // division instructions. #if defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86) __asm__ volatile("divl %4" : "=a"(*quotient_out), "=d"(*rem_out) @@ -175,44 +175,16 @@ static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out, #endif } -// BN_div computes "quotient := numerator / divisor", rounding towards zero, -// and sets up |rem| such that "quotient * divisor + rem = numerator" holds. -// -// Thus: -// -// quotient->neg == numerator->neg ^ divisor->neg -// (unless the result is zero) -// rem->neg == numerator->neg -// (unless the remainder is zero) -// -// If |quotient| or |rem| is NULL, the respective value is not returned. -// -// This was specifically designed to contain fewer branches that may leak -// sensitive information; see "New Branch Prediction Vulnerabilities in OpenSSL -// and Necessary Software Countermeasures" by Onur Acıçmez, Shay Gueron, and -// Jean-Pierre Seifert. int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator, const BIGNUM *divisor, BN_CTX *ctx) { - int norm_shift, loop; - BIGNUM wnum; - BN_ULONG *resp, *wnump; - BN_ULONG d0, d1; - int num_n, div_n; - - // This function relies on the historical minimal-width |BIGNUM| invariant. - // It is already not constant-time (constant-time reductions should use - // Montgomery logic), so we shrink all inputs and intermediate values to - // retain the previous behavior. - - // Invalid zero-padding would have particularly bad consequences. - int numerator_width = bn_minimal_width(numerator); - int divisor_width = bn_minimal_width(divisor); - if ((numerator_width > 0 && numerator->d[numerator_width - 1] == 0) || - (divisor_width > 0 && divisor->d[divisor_width - 1] == 0)) { - OPENSSL_PUT_ERROR(BN, BN_R_NOT_INITIALIZED); - return 0; - } - + // This function implements long division, per Knuth, The Art of Computer + // Programming, Volume 2, Chapter 4.3.1, Algorithm D. 
This algorithm only + // divides non-negative integers, but we round towards zero, so we divide + // absolute values and adjust the signs separately. + // + // Inputs to this function are assumed public and may be leaked by timing and + // cache side channels. Division with secret inputs should use other + // implementation strategies such as Montgomery reduction. if (BN_is_zero(divisor)) { OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO); return 0; @@ -222,174 +194,168 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator, BIGNUM *tmp = BN_CTX_get(ctx); BIGNUM *snum = BN_CTX_get(ctx); BIGNUM *sdiv = BN_CTX_get(ctx); - BIGNUM *res = NULL; - if (quotient == NULL) { - res = BN_CTX_get(ctx); - } else { - res = quotient; - } - if (sdiv == NULL || res == NULL) { + BIGNUM *res = quotient == NULL ? BN_CTX_get(ctx) : quotient; + if (tmp == NULL || snum == NULL || sdiv == NULL || res == NULL) { goto err; } - // First we normalise the numbers - norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2); - if (!BN_lshift(sdiv, divisor, norm_shift)) { + // Knuth step D1: Normalise the numbers such that the divisor's MSB is set. + // This ensures, in Knuth's terminology, that v1 >= b/2, needed for the + // quotient estimation step. + int norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2); + if (!BN_lshift(sdiv, divisor, norm_shift) || + !BN_lshift(snum, numerator, norm_shift)) { goto err; } + + // This algorithm relies on |sdiv| being minimal width. We do not use this + // function on secret inputs, so leaking this is fine. Also minimize |snum| to + // avoid looping on leading zeros, as we're not trying to be leak-free. bn_set_minimal_width(sdiv); - sdiv->neg = 0; - norm_shift += BN_BITS2; - if (!BN_lshift(snum, numerator, norm_shift)) { - goto err; - } bn_set_minimal_width(snum); - snum->neg = 0; - - // Since we don't want to have special-case logic for the case where snum is - // larger than sdiv, we pad snum with enough zeroes without changing its - // value. - if (snum->width <= sdiv->width + 1) { - if (!bn_wexpand(snum, sdiv->width + 2)) { - goto err; - } - for (int i = snum->width; i < sdiv->width + 2; i++) { - snum->d[i] = 0; - } - snum->width = sdiv->width + 2; - } else { - if (!bn_wexpand(snum, snum->width + 1)) { - goto err; - } - snum->d[snum->width] = 0; - snum->width++; - } - - div_n = sdiv->width; - num_n = snum->width; - loop = num_n - div_n; - // Lets setup a 'window' into snum - // This is the part that corresponds to the current - // 'area' being divided - wnum.neg = 0; - wnum.d = &(snum->d[loop]); - wnum.width = div_n; - // only needed when BN_ucmp messes up the values between width and max - wnum.dmax = snum->dmax - loop; // so we don't step out of bounds - - // Get the top 2 words of sdiv - // div_n=sdiv->width; - d0 = sdiv->d[div_n - 1]; - d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2]; - - // pointer to the 'top' of snum - wnump = &(snum->d[num_n - 1]); - - // Setup |res|. |numerator| and |res| may alias, so we save |numerator->neg| - // for later. - const int numerator_neg = numerator->neg; - res->neg = (numerator_neg ^ divisor->neg); - if (!bn_wexpand(res, loop + 1)) { + int div_n = sdiv->width; + const BN_ULONG d0 = sdiv->d[div_n - 1]; + const BN_ULONG d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2]; + assert(d0 & (((BN_ULONG)1) << (BN_BITS2 - 1))); + + // Extend |snum| with zeros to satisfy the long division invariants: + // - |snum| must have at least |div_n| + 1 words. 
+ // - |snum|'s most significant word must be zero to guarantee the first loop + // iteration works with a prefix greater than |sdiv|. (This is the extra u0 + // digit in Knuth step D1.) + int num_n = snum->width <= div_n ? div_n + 1 : snum->width + 1; + if (!bn_resize_words(snum, num_n)) { goto err; } - res->width = loop - 1; - resp = &(res->d[loop - 1]); - // space for temp - if (!bn_wexpand(tmp, div_n + 1)) { + // Knuth step D2: The quotient's width is the difference between numerator and + // denominator. Also set up its sign and size a temporary for the loop. + int loop = num_n - div_n; + res->neg = snum->neg ^ sdiv->neg; + if (!bn_wexpand(res, loop) || // + !bn_wexpand(tmp, div_n + 1)) { goto err; } - - // if res->width == 0 then clear the neg value otherwise decrease - // the resp pointer - if (res->width == 0) { - res->neg = 0; - } else { - resp--; - } - - for (int i = 0; i < loop - 1; i++, wnump--, resp--) { - BN_ULONG q, l0; - // the first part of the loop uses the top two words of snum and sdiv to - // calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv - BN_ULONG n0, n1, rm = 0; - - n0 = wnump[0]; - n1 = wnump[-1]; + res->width = loop; + + // Knuth steps D2 through D7: Compute the quotient with a word-by-word long + // division. Note that Knuth indexes words from most to least significant, so + // our index is reversed. Each loop iteration computes res->d[i] of the + // quotient and updates snum with the running remainder. Before each loop + // iteration, the div_n words beginning at snum->d[i+1] must be less than + // snum. + for (int i = loop - 1; i >= 0; i--) { + // The next word of the quotient, q, is floor(wnum / sdiv), where wnum is + // the div_n + 1 words beginning at snum->d[i]. i starts at + // num_n - div_n - 1, so there are at least div_n + 1 words available. + // + // Knuth step D3: Compute q', an estimate of q by looking at the top words + // of wnum and sdiv. We must estimate such that q' = q or q' = q + 1. + BN_ULONG q, rm = 0; + BN_ULONG *wnum = snum->d + i; + BN_ULONG n0 = wnum[div_n]; + BN_ULONG n1 = wnum[div_n - 1]; if (n0 == d0) { + // Estimate q' = b - 1, where b is the base. q = BN_MASK2; + // Knuth also runs the fixup routine in this case, but this would require + // computing rm and is unnecessary. q' is already close enough. That is, + // the true quotient, q is either b - 1 or b - 2. + // + // By the loop invariant, q <= b - 1, so we must show that q >= b - 2. We + // do this by showing wnum / sdiv >= b - 2. Suppose wnum / sdiv < b - 2. + // wnum and sdiv have the same most significant word, so: + // + // wnum >= n0 * b^div_n + // sdiv < (n0 + 1) * b^(d_div - 1) + // + // Thus: + // + // b - 2 > wnum / sdiv + // > (n0 * b^div_n) / (n0 + 1) * b^(div_n - 1) + // = (n0 * b) / (n0 + 1) + // + // (n0 + 1) * (b - 2) > n0 * b + // n0 * b + b - 2 * n0 - 2 > n0 * b + // b - 2 > 2 * n0 + // b/2 - 1 > n0 + // + // This contradicts the normalization condition, so q >= b - 2 and our + // estimate is close enough. } else { - // n0 < d0 + // Estimate q' = floor(n0n1 / d0). Per Theorem B, q' - 2 <= q <= q', which + // is slightly outside of our bounds. + assert(n0 < d0); bn_div_rem_words(&q, &rm, n0, n1, d0); + // Fix the estimate by examining one more word and adjusting q' as needed. + // This is the second half of step D3 and is sufficient per exercises 19, + // 20, and 21. Although only one iteration is needed to correct q + 2 to + // q + 1, Knuth uses a loop. 
A loop will often also correct q + 1 to q, + // saving the slightly more expensive underflow handling below. + if (div_n > 1) { + BN_ULONG n2 = wnum[div_n - 2]; #ifdef BN_ULLONG - BN_ULLONG t2 = (BN_ULLONG)d1 * q; - for (;;) { - if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | wnump[-2])) { - break; + BN_ULLONG t2 = (BN_ULLONG)d1 * q; + for (;;) { + if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | n2)) { + break; + } + q--; + rm += d0; + if (rm < d0) { + // If rm overflows, the true value exceeds BN_ULONG and the next + // t2 comparison should exit the loop. + break; + } + t2 -= d1; } - q--; - rm += d0; - if (rm < d0) { - break; // don't let rm overflow - } - t2 -= d1; - } -#else // !BN_ULLONG - BN_ULONG t2l, t2h; - BN_UMULT_LOHI(t2l, t2h, d1, q); - for (;;) { - if (t2h < rm || - (t2h == rm && t2l <= wnump[-2])) { - break; +#else // !BN_ULLONG + BN_ULONG t2l, t2h; + BN_UMULT_LOHI(t2l, t2h, d1, q); + for (;;) { + if (t2h < rm || (t2h == rm && t2l <= n2)) { + break; + } + q--; + rm += d0; + if (rm < d0) { + // If rm overflows, the true value exceeds BN_ULONG and the next + // t2 comparison should exit the loop. + break; + } + if (t2l < d1) { + t2h--; + } + t2l -= d1; } - q--; - rm += d0; - if (rm < d0) { - break; // don't let rm overflow - } - if (t2l < d1) { - t2h--; - } - t2l -= d1; - } #endif // !BN_ULLONG + } } - l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q); - tmp->d[div_n] = l0; - wnum.d--; - // ingore top values of the bignums just sub the two - // BN_ULONG arrays with bn_sub_words - if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) { - // Note: As we have considered only the leading - // two BN_ULONGs in the calculation of q, sdiv * q - // might be greater than wnum (but then (q-1) * sdiv - // is less or equal than wnum) + // Knuth step D4 through D6: Now q' = q or q' = q + 1, and + // -sdiv < wnum - sdiv * q < sdiv. If q' = q + 1, the subtraction will + // underflow, and we fix it up below. + tmp->d[div_n] = bn_mul_words(tmp->d, sdiv->d, div_n, q); + if (bn_sub_words(wnum, wnum, tmp->d, div_n + 1)) { q--; - if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) { - // we can't have an overflow here (assuming - // that q != 0, but if q == 0 then tmp is - // zero anyway) - (*wnump)++; - } + // The final addition is expected to overflow, canceling the underflow. + wnum[div_n] += bn_add_words(wnum, wnum, sdiv->d, div_n); } - // store part of the result - *resp = q; + + // q is now correct, and wnum has been updated to the running remainder. + res->d[i] = q; } + // Trim leading zeros and correct any negative zeros. bn_set_minimal_width(snum); + bn_set_minimal_width(res); - if (rem != NULL) { - if (!BN_rshift(rem, snum, norm_shift)) { - goto err; - } - if (!BN_is_zero(rem)) { - rem->neg = numerator_neg; - } + // Knuth step D8: Unnormalize. snum now contains the remainder. + if (rem != NULL && !BN_rshift(rem, snum, norm_shift)) { + goto err; } - bn_set_minimal_width(res); BN_CTX_end(ctx); return 1; @@ -406,8 +372,9 @@ int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) { return 1; } - // now -|d| < r < 0, so we have to set r := r + |d|. - return (d->neg ? BN_sub : BN_add)(r, r, d); + // now -d < r < 0, so we have to set r := r + d. Ignoring the sign bits, this + // is r = d - r. 
+ return BN_usub(r, d, r); } BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry, @@ -425,7 +392,7 @@ BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry, // // Although |carry| may be one if it was one on input and |bn_sub_words| // returns zero, this would give |r| > |m|, violating our input assumptions. - assert(carry == 0 || carry == (BN_ULONG)-1); + declassify_assert(carry + 1 <= 1); bn_select_words(r, carry, a /* r < 0 */, r /* r >= 0 */, num); return carry; } @@ -434,7 +401,7 @@ BN_ULONG bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, const BN_ULONG *m, BN_ULONG *tmp, size_t num) { // See |bn_reduce_once| for why this logic works. carry -= bn_sub_words(tmp, r, m, num); - assert(carry == 0 || carry == (BN_ULONG)-1); + declassify_assert(carry + 1 <= 1); bn_select_words(r, carry, r /* tmp < 0 */, tmp /* tmp >= 0 */, num); return carry; } @@ -504,7 +471,7 @@ int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder, // |divisor_min_bits| bits, the top |divisor_min_bits - 1| can be incorporated // without reductions. This significantly speeds up |RSA_check_key|. For // simplicity, we round down to a whole number of words. - assert(divisor_min_bits <= BN_num_bits(divisor)); + declassify_assert(divisor_min_bits <= BN_num_bits(divisor)); int initial_words = 0; if (divisor_min_bits > 0) { initial_words = (divisor_min_bits - 1) / BN_BITS2; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c.inc index f5d5eeec..41c5dbdc 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c.inc @@ -39,7 +39,7 @@ static uint16_t mod_u16(uint32_t n, uint16_t d, uint32_t p, uint32_t m) { // Multiply and subtract to get the remainder. n -= d * t; - assert(n < d); + declassify_assert(n < d); return n; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c.inc similarity index 94% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c.inc index 64b52572..b18520e7 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c.inc @@ -119,6 +119,50 @@ #include "internal.h" #include "rsaz_exp.h" +#if defined(OPENSSL_BN_ASM_MONT5) + +// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it +// by |ap| modulo |np|, and stores the result in |rp|. The values are |num| +// words long and represented in Montgomery form. |n0| is a pointer to the +// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least +// 16 bytes. |power| must be less than 32 and is treated as secret. +// +// WARNING: This function implements Almost Montgomery Multiplication from +// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. +// However, even if they are fully reduced, the output may not be. 
+static void bn_mul_mont_gather5( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power) { + if (bn_mulx4x_mont_gather5_capable(num)) { + bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power); + } else if (bn_mul4x_mont_gather5_capable(num)) { + bn_mul4x_mont_gather5(rp, ap, table, np, n0, num, power); + } else { + bn_mul_mont_gather5_nohw(rp, ap, table, np, n0, num, power); + } +} + +// bn_power5 squares |ap| five times and multiplies it by the value stored at +// index |power| of |table|, modulo |np|. It stores the result in |rp|. The +// values are |num| words long and represented in Montgomery form. |n0| is a +// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible +// by 8. |power| must be less than 32 and is treated as secret. +// +// WARNING: This function implements Almost Montgomery Multiplication from +// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. +// However, even if they are fully reduced, the output may not be. +static void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, + int power) { + assert(bn_power5_capable(num)); + if (bn_powerx5_capable(num)) { + bn_powerx5(rp, ap, table, np, n0, num, power); + } else { + bn_power5_nohw(rp, ap, table, np, n0, num, power); + } +} + +#endif // defined(OPENSSL_BN_ASM_MONT5) int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { int i, bits, ret = 0; @@ -1013,7 +1057,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, // Prepare a^1 in the Montgomery domain. assert(!a->neg); - assert(BN_ucmp(a, m) < 0); + declassify_assert(BN_ucmp(a, m) < 0); if (!BN_to_montgomery(&am, a, mont, ctx) || !bn_resize_words(&am, top)) { goto err; @@ -1079,7 +1123,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, // Scan the exponent one window at a time starting from the most // significant bits. - if (top & 7) { + if (!bn_power5_capable(top)) { while (bits >= 0) { for (wvalue = 0, i = 0; i < 5; i++, bits--) { wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c.inc similarity index 96% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c.inc index 8fe14379..249ffbf8 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c.inc @@ -93,7 +93,7 @@ static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x, // At least one of |u| and |v| is now even. BN_ULONG u_is_odd = word_is_odd_mask(u->d[0]); BN_ULONG v_is_odd = word_is_odd_mask(v->d[0]); - assert(!(u_is_odd & v_is_odd)); + declassify_assert(!(u_is_odd & v_is_odd)); // If both are even, the final GCD gains a factor of two. shift += 1 & (~u_is_odd & ~v_is_odd); @@ -106,7 +106,7 @@ static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x, // One of |u| or |v| is zero at this point. 
The algorithm usually makes |u| // zero, unless |y| was already zero on input. Fix this by combining the // values. - assert(BN_is_zero(u) || BN_is_zero(v)); + declassify_assert(BN_is_zero(u) | BN_is_zero(v)); for (size_t i = 0; i < width; i++) { v->d[i] |= u->d[i]; } @@ -289,7 +289,7 @@ int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a, // and |v| is now even. BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]); BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]); - assert(u_is_even != v_is_even); + declassify_assert(u_is_even != v_is_even); // Halve the even one and adjust the corresponding coefficient. maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width); @@ -313,8 +313,11 @@ int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a, maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width); } - assert(BN_is_zero(v)); - if (!BN_is_one(u)) { + declassify_assert(BN_is_zero(v)); + // While the inputs and output are secret, this function considers whether the + // input was invertible to be public. It is used as part of RSA key + // generation, where inputs are chosen to already be invertible. + if (constant_time_declassify_int(!BN_is_one(u))) { *out_no_inverse = 1; OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); goto err; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/generic.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/generic.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/generic.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/generic.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/internal.h index dd3ee9fc..570276a6 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/internal.h @@ -438,18 +438,26 @@ int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) #define OPENSSL_BN_ASM_MONT5 -// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it -// by |ap| modulo |np|, and stores the result in |rp|. The values are |num| -// words long and represented in Montgomery form. |n0| is a pointer to the -// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least -// 16 bytes. |power| must be less than 32 and is treated as secret. -// -// WARNING: This function implements Almost Montgomery Multiplication from -// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. -// However, even if they are fully reduced, the output may not be. -void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, - const BN_ULONG *table, const BN_ULONG *np, - const BN_ULONG *n0, int num, int power); +// The following functions implement |bn_mul_mont_gather5|. See +// |bn_mul_mont_gather5| for details. 
+OPENSSL_INLINE int bn_mul4x_mont_gather5_capable(int num) { + return (num & 7) == 0; +} +void bn_mul4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); + +OPENSSL_INLINE int bn_mulx4x_mont_gather5_capable(int num) { + return bn_mul4x_mont_gather5_capable(num) && CRYPTO_is_ADX_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable(); +} +void bn_mulx4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); + +void bn_mul_mont_gather5_nohw(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); // bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of // |table| are |num| words long. |power| must be less than 32 and is treated as @@ -463,17 +471,19 @@ void bn_scatter5(const BN_ULONG *inp, size_t num, BN_ULONG *table, // is treated as secret. |table| must be aligned to at least 16 bytes. void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power); -// bn_power5 squares |ap| five times and multiplies it by the value stored at -// index |power| of |table|, modulo |np|. It stores the result in |rp|. The -// values are |num| words long and represented in Montgomery form. |n0| is a -// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible -// by 8. |power| must be less than 32 and is treated as secret. -// -// WARNING: This function implements Almost Montgomery Multiplication from -// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. -// However, even if they are fully reduced, the output may not be. -void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, - const BN_ULONG *np, const BN_ULONG *n0, int num, int power); +// The following functions implement |bn_power5|. See |bn_power5| for details. 
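For context, the capability helpers introduced in this hunk let plain C pick the widest assembly path the CPU and operand size allow: the MULX/ADX variants require BMI1, BMI2 and ADX plus an operand length divisible by eight words, and the code otherwise falls back to the 4x or no-hardware routines. A minimal, self-contained sketch of that selection logic follows; the names used here (cpu_has_adx_bmi, power5_capable, pick_power5_impl) are illustrative stand-ins, not BoringSSL functions.

#include <stdio.h>

/* Sketch of the capability-gated dispatch pattern: pick the MULX/ADX assembly
   when both the CPU and the operand width allow it, the plain 4x/nohw code when
   only the width allows it, and the generic path otherwise. Stand-in names. */
static int cpu_has_adx_bmi(void) { return 0; } /* stand-in for the CPUID probes */

static int power5_capable(int num) { return (num & 7) == 0; }

static int powerx5_capable(int num) {
  return power5_capable(num) && cpu_has_adx_bmi();
}

static const char *pick_power5_impl(int num) {
  if (!power5_capable(num)) {
    return "generic window code"; /* |num| not a multiple of 8 */
  }
  if (powerx5_capable(num)) {
    return "bn_powerx5 (MULX/ADX)";
  }
  return "bn_power5_nohw";
}

int main(void) {
  for (int num = 4; num <= 32; num += 4) {
    printf("num=%2d -> %s\n", num, pick_power5_impl(num));
  }
  return 0;
}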
+void bn_power5_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, int power); + +OPENSSL_INLINE int bn_power5_capable(int num) { return (num & 7) == 0; } + +OPENSSL_INLINE int bn_powerx5_capable(int num) { + return bn_power5_capable(num) && CRYPTO_is_ADX_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable(); +} +void bn_powerx5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, int power); + #endif // !OPENSSL_NO_ASM && OPENSSL_X86_64 uint64_t bn_mont_n0(const BIGNUM *n); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/jacobi.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/jacobi.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/jacobi.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/jacobi.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c.inc index 1f71dd44..04a491eb 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c.inc @@ -153,7 +153,7 @@ static uint64_t bn_neg_inv_mod_r_u64(uint64_t n) { // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha. #if BN_BITS2 == 64 && defined(BN_ULLONG) - assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); + declassify_assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); #endif return v; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c.inc index 5ae6b49b..e7fd224d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c.inc @@ -292,7 +292,7 @@ static void bn_mul_recursive(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, } // The product should fit without carries. - assert(c == 0); + declassify_assert(c == 0); } // bn_mul_part_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r| @@ -406,7 +406,7 @@ static void bn_mul_part_recursive(BN_ULONG *r, const BN_ULONG *a, } // The product should fit without carries. - assert(c == 0); + declassify_assert(c == 0); } // bn_mul_impl implements |BN_mul| and |bn_mul_consttime|. 
Note this function diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c.inc similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c.inc index d5891394..ee0ce622 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c.inc @@ -487,7 +487,10 @@ int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add, static int bn_trial_division(uint16_t *out, const BIGNUM *bn) { const size_t num_primes = num_trial_division_primes(bn); for (size_t i = 1; i < num_primes; i++) { - if (bn_mod_u16_consttime(bn, kPrimes[i]) == 0) { + // During RSA key generation, |bn| may be secret, but only if |bn| was + // prime, so it is safe to leak failed trial divisions. + if (constant_time_declassify_int(bn_mod_u16_consttime(bn, kPrimes[i]) == + 0)) { *out = kPrimes[i]; return 1; } @@ -573,7 +576,8 @@ int bn_miller_rabin_iteration(const BN_MILLER_RABIN *miller_rabin, // To avoid leaking |a|, we run the loop to |w_bits| and mask off all // iterations once |j| = |a|. for (int j = 1; j < miller_rabin->w_bits; j++) { - if (constant_time_eq_int(j, miller_rabin->a) & ~is_possibly_prime) { + if (constant_time_declassify_w(constant_time_eq_int(j, miller_rabin->a) & + ~is_possibly_prime)) { // If the loop is done and we haven't seen z = 1 or z = w-1 yet, the // value is composite and we can break in variable time. break; @@ -593,12 +597,14 @@ int bn_miller_rabin_iteration(const BN_MILLER_RABIN *miller_rabin, // Step 4.5.3. If z = 1 and the loop is not done, the previous value of z // was not -1. There are no non-trivial square roots of 1 modulo a prime, so // w is composite and we may exit in variable time. - if (BN_equal_consttime(z, miller_rabin->one_mont) & ~is_possibly_prime) { + if (constant_time_declassify_w( + BN_equal_consttime(z, miller_rabin->one_mont) & + ~is_possibly_prime)) { break; } } - *out_is_possibly_prime = is_possibly_prime & 1; + *out_is_possibly_prime = constant_time_declassify_w(is_possibly_prime) & 1; ret = 1; err: @@ -736,8 +742,9 @@ int BN_primality_test(int *out_is_probably_prime, const BIGNUM *w, int checks, crypto_word_t uniform_iterations = 0; // Using |constant_time_lt_w| seems to prevent the compiler from optimizing // this into two jumps. 
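The recurring assert-to-declassify_assert and constant_time_declassify_* changes in these hunks share one idea: a value computed from secrets but safe to reveal (for example, "this candidate failed trial division") is explicitly marked public before the code branches on it, so constant-time validation tooling does not report a secret-dependent branch. In ordinary builds the helpers reduce to a plain assert or an identity function. The rough model below illustrates the pattern; the names are stand-ins, not the BoringSSL definitions.

#include <assert.h>

/* Rough model of the declassification helpers: in release builds they are an
   identity function around assert(); under constant-time validation they would
   additionally mark the value as public (e.g. via a Valgrind client request).
   All names here are illustrative stand-ins. */
typedef unsigned long crypto_word_sketch_t;

static crypto_word_sketch_t declassify_w_sketch(crypto_word_sketch_t v) {
  /* A validation build would mark |v| as public at this point. */
  return v;
}

#define declassify_assert_sketch(x) assert(declassify_w_sketch(x))

int main(void) {
  crypto_word_sketch_t carry = 0; /* e.g. "the product should fit without carries" */
  declassify_assert_sketch(carry == 0);
  /* Branching on |carry| after this point is treated as public. */
  return (int)carry;
}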
- for (int i = 1; (i <= BN_PRIME_CHECKS_BLINDED) | - constant_time_lt_w(uniform_iterations, checks); + for (int i = 1; constant_time_declassify_w( + (i <= BN_PRIME_CHECKS_BLINDED) | + constant_time_lt_w(uniform_iterations, checks)); i++) { // Step 4.1-4.2 int is_uniform; @@ -766,7 +773,7 @@ int BN_primality_test(int *out_is_probably_prime, const BIGNUM *w, int checks, } } - assert(uniform_iterations >= (crypto_word_t)checks); + declassify_assert(uniform_iterations >= (crypto_word_t)checks); *out_is_probably_prime = 1; ret = 1; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c.inc similarity index 96% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c.inc index 91d337fd..5e25b93d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c.inc @@ -113,10 +113,9 @@ #include #include -#include #include "../../internal.h" -#include "../rand/internal.h" +#include "../bcm_interface.h" #include "../service_indicator/internal.h" #include "internal.h" @@ -157,7 +156,7 @@ int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) { } FIPS_service_indicator_lock_state(); - RAND_bytes((uint8_t *)rnd->d, words * sizeof(BN_ULONG)); + BCM_rand_bytes((uint8_t *)rnd->d, words * sizeof(BN_ULONG)); FIPS_service_indicator_unlock_state(); rnd->d[words - 1] &= mask; @@ -225,8 +224,7 @@ static int bn_range_to_mask(size_t *out_words, BN_ULONG *out_mask, while (words > 0 && max_exclusive[words - 1] == 0) { words--; } - if (words == 0 || - (words == 1 && max_exclusive[0] <= min_inclusive)) { + if (words == 0 || (words == 1 && max_exclusive[0] <= min_inclusive)) { OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE); return 0; } @@ -275,8 +273,8 @@ int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive, // Steps 4 and 5. Use |words| and |mask| together to obtain a string of N // bits, where N is the bit length of |max_exclusive|. FIPS_service_indicator_lock_state(); - RAND_bytes_with_additional_data((uint8_t *)out, words * sizeof(BN_ULONG), - additional_data); + BCM_rand_bytes_with_additional_data( + (uint8_t *)out, words * sizeof(BN_ULONG), additional_data); FIPS_service_indicator_unlock_state(); out[words - 1] &= mask; @@ -326,7 +324,7 @@ int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive, // Select a uniform random number with num_bits(max_exclusive) bits. FIPS_service_indicator_lock_state(); - RAND_bytes((uint8_t *)r->d, words * sizeof(BN_ULONG)); + BCM_rand_bytes((uint8_t *)r->d, words * sizeof(BN_ULONG)); FIPS_service_indicator_unlock_state(); r->d[words - 1] &= mask; @@ -339,7 +337,8 @@ int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive, // If the value is not in range, force it to be in range. 
r->d[0] |= constant_time_select_w(in_range, 0, min_inclusive); r->d[words - 1] &= constant_time_select_w(in_range, BN_MASK2, mask >> 1); - assert(bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words)); + declassify_assert( + bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words)); r->neg = 0; r->width = (int)words; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/rsaz_exp.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/rsaz_exp.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/rsaz_exp.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/rsaz_exp.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/shift.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/shift.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/shift.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/shift.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c.inc index 13c7892c..b4b98158 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c.inc @@ -236,7 +236,7 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { goto end; } if (BN_ucmp(y, p) >= 0) { - if (!(p->neg ? BN_add : BN_sub)(y, y, p)) { + if (!BN_usub(y, y, p)) { goto end; } } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-ios.ios.arm.S deleted file mode 100644 index c7bc35e8..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-ios.ios.arm.S +++ /dev/null @@ -1,1534 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ -@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ of Linaro. Permission to use under GPL terms is granted. -@ ==================================================================== - -@ Bit-sliced AES for ARM NEON -@ -@ February 2012. -@ -@ This implementation is direct adaptation of bsaes-x86_64 module for -@ ARM NEON. Except that this module is endian-neutral [in sense that -@ it can be compiled for either endianness] by courtesy of vld1.8's -@ neutrality.
Initial version doesn't implement interface to OpenSSL, -@ only low-level primitives and unsupported entry points, just enough -@ to collect performance results, which for Cortex-A8 core are: -@ -@ encrypt 19.5 cycles per byte processed with 128-bit key -@ decrypt 22.1 cycles per byte processed with 128-bit key -@ key conv. 440 cycles per 128-bit key/0.18 of 8x block -@ -@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, -@ which is [much] worse than anticipated (for further details see -@ http://www.openssl.org/~appro/Snapdragon-S4.html). -@ -@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code -@ manages in 20.0 cycles]. -@ -@ When comparing to x86_64 results keep in mind that NEON unit is -@ [mostly] single-issue and thus can't [fully] benefit from -@ instruction-level parallelism. And when comparing to aes-armv4 -@ results keep in mind key schedule conversion overhead (see -@ bsaes-x86_64.pl for further details)... -@ -@ - -@ April-August 2013 -@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. - -#ifndef __KERNEL__ -# include - -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -# define VFP_ABI_FRAME 0x40 -#else -# define VFP_ABI_PUSH -# define VFP_ABI_POP -# define VFP_ABI_FRAME 0 -# define BSAES_ASM_EXTENDED_KEY -# define XTS_CHAIN_TWEAK -# define __ARM_MAX_ARCH__ 7 -#endif - -#ifdef __thumb__ -# define adrl adr -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.text -.syntax unified @ ARMv7-capable assembler is expected to handle this -#if defined(__thumb2__) && !defined(__APPLE__) -.thumb -#else -.code 32 -# undef __thumb2__ -#endif - -#ifdef __thumb2__ -.thumb_func _bsaes_decrypt8 -#endif -.align 4 -_bsaes_decrypt8: - adr r6,. - vldmia r4!, {q9} @ round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0ISR -#else - add r6,r6,#LM0ISR-_bsaes_decrypt8 -#endif - - vldmia r6!, {q8} @ LM0ISR - veor q10, q0, q9 @ xor with round0 key - veor q11, q1, q9 - vtbl.8 d0, {q10}, d16 - vtbl.8 d1, {q10}, d17 - veor q12, q2, q9 - vtbl.8 d2, {q11}, d16 - vtbl.8 d3, {q11}, d17 - veor q13, q3, q9 - vtbl.8 d4, {q12}, d16 - vtbl.8 d5, {q12}, d17 - veor q14, q4, q9 - vtbl.8 d6, {q13}, d16 - vtbl.8 d7, {q13}, d17 - veor q15, q5, q9 - vtbl.8 d8, {q14}, d16 - vtbl.8 d9, {q14}, d17 - veor q10, q6, q9 - vtbl.8 d10, {q15}, d16 - vtbl.8 d11, {q15}, d17 - veor q11, q7, q9 - vtbl.8 d12, {q10}, d16 - vtbl.8 d13, {q10}, d17 - vtbl.8 d14, {q11}, d16 - vtbl.8 d15, {q11}, d17 - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q4, #1 - veor q10, q10, q7 - veor q11, q11, q5 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #1 - veor q5, q5, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q3 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q3, q3, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q5, #2 - vshr.u64 q11, q4, #2 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q7, q7, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q5, q5, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q3 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q3, q3, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - 
veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q3, #4 - vshr.u64 q11, q2, #4 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q6, q6, q11 - vshl.u64 q11, q11, #4 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q5 - veor q11, q11, q4 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q4, q4, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - sub r5,r5,#1 - b Ldec_sbox -.align 4 -Ldec_loop: - vldmia r4!, {q8,q9,q10,q11} - veor q8, q8, q0 - veor q9, q9, q1 - vtbl.8 d0, {q8}, d24 - vtbl.8 d1, {q8}, d25 - vldmia r4!, {q8} - veor q10, q10, q2 - vtbl.8 d2, {q9}, d24 - vtbl.8 d3, {q9}, d25 - vldmia r4!, {q9} - veor q11, q11, q3 - vtbl.8 d4, {q10}, d24 - vtbl.8 d5, {q10}, d25 - vldmia r4!, {q10} - vtbl.8 d6, {q11}, d24 - vtbl.8 d7, {q11}, d25 - vldmia r4!, {q11} - veor q8, q8, q4 - veor q9, q9, q5 - vtbl.8 d8, {q8}, d24 - vtbl.8 d9, {q8}, d25 - veor q10, q10, q6 - vtbl.8 d10, {q9}, d24 - vtbl.8 d11, {q9}, d25 - veor q11, q11, q7 - vtbl.8 d12, {q10}, d24 - vtbl.8 d13, {q10}, d25 - vtbl.8 d14, {q11}, d24 - vtbl.8 d15, {q11}, d25 -Ldec_sbox: - veor q1, q1, q4 - veor q3, q3, q4 - - veor q4, q4, q7 - veor q1, q1, q6 - veor q2, q2, q7 - veor q6, q6, q4 - - veor q0, q0, q1 - veor q2, q2, q5 - veor q7, q7, q6 - veor q3, q3, q0 - veor q5, q5, q0 - veor q1, q1, q3 - veor q11, q3, q0 - veor q10, q7, q4 - veor q9, q1, q6 - veor q13, q4, q0 - vmov q8, q10 - veor q12, q5, q2 - - vorr q10, q10, q9 - veor q15, q11, q8 - vand q14, q11, q12 - vorr q11, q11, q12 - veor q12, q12, q9 - vand q8, q8, q9 - veor q9, q6, q2 - vand q15, q15, q12 - vand q13, q13, q9 - veor q9, q3, q7 - veor q12, q1, q5 - veor q11, q11, q13 - veor q10, q10, q13 - vand q13, q9, q12 - vorr q9, q9, q12 - veor q11, q11, q15 - veor q8, q8, q13 - veor q10, q10, q14 - veor q9, q9, q15 - veor q8, q8, q14 - vand q12, q4, q6 - veor q9, q9, q14 - vand q13, q0, q2 - vand q14, q7, q1 - vorr q15, q3, q5 - veor q11, q11, q12 - veor q9, q9, q14 - veor q8, q8, q15 - veor q10, q10, q13 - - @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 - - @ new smaller inversion - - vand q14, q11, q9 - vmov q12, q8 - - veor q13, q10, q14 - veor q15, q8, q14 - veor q14, q8, q14 @ q14=q15 - - vbsl q13, q9, q8 - vbsl q15, q11, q10 - veor q11, q11, q10 - - vbsl q12, q13, q14 - vbsl q8, q14, q13 - - vand q14, q12, q15 - veor q9, q9, q8 - - veor q14, q14, q11 - veor q12, q5, q2 - veor q8, q1, q6 - veor q10, q15, q14 - vand q10, q10, q5 - veor q5, q5, q1 - vand q11, q1, q15 - vand q5, q5, q14 - veor q1, q11, q10 - veor q5, q5, q11 - veor q15, q15, q13 - veor q14, q14, q9 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q2 - veor q12, q12, q8 - veor q2, q2, q6 - vand q8, q8, q15 - vand q6, q6, q13 - vand q12, q12, q14 - vand q2, q2, q9 - veor q8, q8, q12 - veor q2, q2, q6 - veor q12, q12, q11 - veor q6, q6, q10 - veor q5, q5, q12 - veor q2, q2, q12 - veor q1, q1, q8 - veor q6, q6, q8 - - veor q12, q3, q0 - veor q8, q7, q4 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q0 - veor q12, q12, q8 - veor q0, q0, q4 - vand q8, q8, q15 - vand q4, q4, q13 - vand q12, q12, q14 - vand q0, q0, q9 - veor q8, q8, q12 - veor q0, q0, q4 - veor q12, q12, q11 - veor q4, q4, q10 - veor q15, q15, q13 - veor q14, q14, q9 - veor q10, q15, q14 - vand q10, q10, q3 - veor q3, q3, q7 - vand q11, q7, q15 - vand q3, q3, q14 - veor q7, q11, q10 - veor q3, q3, q11 - veor 
q3, q3, q12 - veor q0, q0, q12 - veor q7, q7, q8 - veor q4, q4, q8 - veor q1, q1, q7 - veor q6, q6, q5 - - veor q4, q4, q1 - veor q2, q2, q7 - veor q5, q5, q7 - veor q4, q4, q2 - veor q7, q7, q0 - veor q4, q4, q5 - veor q3, q3, q6 - veor q6, q6, q1 - veor q3, q3, q4 - - veor q4, q4, q0 - veor q7, q7, q3 - subs r5,r5,#1 - bcc Ldec_done - @ multiplication by 0x05-0x00-0x04-0x00 - vext.8 q8, q0, q0, #8 - vext.8 q14, q3, q3, #8 - vext.8 q15, q5, q5, #8 - veor q8, q8, q0 - vext.8 q9, q1, q1, #8 - veor q14, q14, q3 - vext.8 q10, q6, q6, #8 - veor q15, q15, q5 - vext.8 q11, q4, q4, #8 - veor q9, q9, q1 - vext.8 q12, q2, q2, #8 - veor q10, q10, q6 - vext.8 q13, q7, q7, #8 - veor q11, q11, q4 - veor q12, q12, q2 - veor q13, q13, q7 - - veor q0, q0, q14 - veor q1, q1, q14 - veor q6, q6, q8 - veor q2, q2, q10 - veor q4, q4, q9 - veor q1, q1, q15 - veor q6, q6, q15 - veor q2, q2, q14 - veor q7, q7, q11 - veor q4, q4, q14 - veor q3, q3, q12 - veor q2, q2, q15 - veor q7, q7, q15 - veor q5, q5, q13 - vext.8 q8, q0, q0, #12 @ x0 <<< 32 - vext.8 q9, q1, q1, #12 - veor q0, q0, q8 @ x0 ^ (x0 <<< 32) - vext.8 q10, q6, q6, #12 - veor q1, q1, q9 - vext.8 q11, q4, q4, #12 - veor q6, q6, q10 - vext.8 q12, q2, q2, #12 - veor q4, q4, q11 - vext.8 q13, q7, q7, #12 - veor q2, q2, q12 - vext.8 q14, q3, q3, #12 - veor q7, q7, q13 - vext.8 q15, q5, q5, #12 - veor q3, q3, q14 - - veor q9, q9, q0 - veor q5, q5, q15 - vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) - veor q10, q10, q1 - veor q8, q8, q5 - veor q9, q9, q5 - vext.8 q1, q1, q1, #8 - veor q13, q13, q2 - veor q0, q0, q8 - veor q14, q14, q7 - veor q1, q1, q9 - vext.8 q8, q2, q2, #8 - veor q12, q12, q4 - vext.8 q9, q7, q7, #8 - veor q15, q15, q3 - vext.8 q2, q4, q4, #8 - veor q11, q11, q6 - vext.8 q7, q5, q5, #8 - veor q12, q12, q5 - vext.8 q4, q3, q3, #8 - veor q11, q11, q5 - vext.8 q3, q6, q6, #8 - veor q5, q9, q13 - veor q11, q11, q2 - veor q7, q7, q15 - veor q6, q4, q14 - veor q4, q8, q12 - veor q2, q3, q10 - vmov q3, q11 - @ vmov q5, q9 - vldmia r6, {q12} @ LISR - ite eq @ Thumb2 thing, sanity check in ARM - addeq r6,r6,#0x10 - bne Ldec_loop - vldmia r6, {q12} @ LISRM0 - b Ldec_loop -.align 4 -Ldec_done: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q3, #1 - vshr.u64 q11, q2, #1 - veor q10, q10, q5 - veor q11, q11, q7 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #1 - veor q7, q7, q11 - vshl.u64 q11, q11, #1 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q4 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q4, q4, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q7, #2 - vshr.u64 q11, q2, #2 - veor q10, q10, q5 - veor q11, q11, q3 - vand q10, q10, q9 - vand q11, q11, q9 - veor q5, q5, q10 - vshl.u64 q10, q10, #2 - veor q3, q3, q11 - vshl.u64 q11, q11, #2 - veor q7, q7, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q4 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q4, q4, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q4, #4 - vshr.u64 q11, q6, #4 - veor q10, q10, q5 - veor q11, q11, q3 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q3, q3, q11 - vshl.u64 q11, q11, #4 - veor q4, q4, q10 - veor q6, q6, q11 - vshr.u64 q10, q1, #4 - 
vshr.u64 q11, q0, #4 - veor q10, q10, q7 - veor q11, q11, q2 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q2, q2, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - vldmia r4, {q8} @ last round key - veor q6, q6, q8 - veor q4, q4, q8 - veor q2, q2, q8 - veor q7, q7, q8 - veor q3, q3, q8 - veor q5, q5, q8 - veor q0, q0, q8 - veor q1, q1, q8 - bx lr - - - -.align 6 -_bsaes_const: -LM0ISR:@ InvShiftRows constants -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -LISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -LISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -LM0SR:@ ShiftRows constants -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -LSR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -LSRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -LM0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -LREVM0SR: -.quad 0x090d01050c000408, 0x03070b0f060a0e02 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 6 - - -#ifdef __thumb2__ -.thumb_func _bsaes_encrypt8 -#endif -.align 4 -_bsaes_encrypt8: - adr r6,. - vldmia r4!, {q9} @ round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0SR -#else - sub r6,r6,#_bsaes_encrypt8-LM0SR -#endif - - vldmia r6!, {q8} @ LM0SR -_bsaes_encrypt8_alt: - veor q10, q0, q9 @ xor with round0 key - veor q11, q1, q9 - vtbl.8 d0, {q10}, d16 - vtbl.8 d1, {q10}, d17 - veor q12, q2, q9 - vtbl.8 d2, {q11}, d16 - vtbl.8 d3, {q11}, d17 - veor q13, q3, q9 - vtbl.8 d4, {q12}, d16 - vtbl.8 d5, {q12}, d17 - veor q14, q4, q9 - vtbl.8 d6, {q13}, d16 - vtbl.8 d7, {q13}, d17 - veor q15, q5, q9 - vtbl.8 d8, {q14}, d16 - vtbl.8 d9, {q14}, d17 - veor q10, q6, q9 - vtbl.8 d10, {q15}, d16 - vtbl.8 d11, {q15}, d17 - veor q11, q7, q9 - vtbl.8 d12, {q10}, d16 - vtbl.8 d13, {q10}, d17 - vtbl.8 d14, {q11}, d16 - vtbl.8 d15, {q11}, d17 -_bsaes_encrypt8_bitslice: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q4, #1 - veor q10, q10, q7 - veor q11, q11, q5 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #1 - veor q5, q5, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q3 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q3, q3, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q5, #2 - vshr.u64 q11, q4, #2 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q7, q7, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q5, q5, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q3 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q3, q3, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q3, #4 - vshr.u64 q11, q2, #4 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q6, q6, q11 - vshl.u64 q11, q11, #4 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q5 - veor q11, q11, q4 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 
- veor q4, q4, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - sub r5,r5,#1 - b Lenc_sbox -.align 4 -Lenc_loop: - vldmia r4!, {q8,q9,q10,q11} - veor q8, q8, q0 - veor q9, q9, q1 - vtbl.8 d0, {q8}, d24 - vtbl.8 d1, {q8}, d25 - vldmia r4!, {q8} - veor q10, q10, q2 - vtbl.8 d2, {q9}, d24 - vtbl.8 d3, {q9}, d25 - vldmia r4!, {q9} - veor q11, q11, q3 - vtbl.8 d4, {q10}, d24 - vtbl.8 d5, {q10}, d25 - vldmia r4!, {q10} - vtbl.8 d6, {q11}, d24 - vtbl.8 d7, {q11}, d25 - vldmia r4!, {q11} - veor q8, q8, q4 - veor q9, q9, q5 - vtbl.8 d8, {q8}, d24 - vtbl.8 d9, {q8}, d25 - veor q10, q10, q6 - vtbl.8 d10, {q9}, d24 - vtbl.8 d11, {q9}, d25 - veor q11, q11, q7 - vtbl.8 d12, {q10}, d24 - vtbl.8 d13, {q10}, d25 - vtbl.8 d14, {q11}, d24 - vtbl.8 d15, {q11}, d25 -Lenc_sbox: - veor q2, q2, q1 - veor q5, q5, q6 - veor q3, q3, q0 - veor q6, q6, q2 - veor q5, q5, q0 - - veor q6, q6, q3 - veor q3, q3, q7 - veor q7, q7, q5 - veor q3, q3, q4 - veor q4, q4, q5 - - veor q2, q2, q7 - veor q3, q3, q1 - veor q1, q1, q5 - veor q11, q7, q4 - veor q10, q1, q2 - veor q9, q5, q3 - veor q13, q2, q4 - vmov q8, q10 - veor q12, q6, q0 - - vorr q10, q10, q9 - veor q15, q11, q8 - vand q14, q11, q12 - vorr q11, q11, q12 - veor q12, q12, q9 - vand q8, q8, q9 - veor q9, q3, q0 - vand q15, q15, q12 - vand q13, q13, q9 - veor q9, q7, q1 - veor q12, q5, q6 - veor q11, q11, q13 - veor q10, q10, q13 - vand q13, q9, q12 - vorr q9, q9, q12 - veor q11, q11, q15 - veor q8, q8, q13 - veor q10, q10, q14 - veor q9, q9, q15 - veor q8, q8, q14 - vand q12, q2, q3 - veor q9, q9, q14 - vand q13, q4, q0 - vand q14, q1, q5 - vorr q15, q7, q6 - veor q11, q11, q12 - veor q9, q9, q14 - veor q8, q8, q15 - veor q10, q10, q13 - - @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 - - @ new smaller inversion - - vand q14, q11, q9 - vmov q12, q8 - - veor q13, q10, q14 - veor q15, q8, q14 - veor q14, q8, q14 @ q14=q15 - - vbsl q13, q9, q8 - vbsl q15, q11, q10 - veor q11, q11, q10 - - vbsl q12, q13, q14 - vbsl q8, q14, q13 - - vand q14, q12, q15 - veor q9, q9, q8 - - veor q14, q14, q11 - veor q12, q6, q0 - veor q8, q5, q3 - veor q10, q15, q14 - vand q10, q10, q6 - veor q6, q6, q5 - vand q11, q5, q15 - vand q6, q6, q14 - veor q5, q11, q10 - veor q6, q6, q11 - veor q15, q15, q13 - veor q14, q14, q9 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q0 - veor q12, q12, q8 - veor q0, q0, q3 - vand q8, q8, q15 - vand q3, q3, q13 - vand q12, q12, q14 - vand q0, q0, q9 - veor q8, q8, q12 - veor q0, q0, q3 - veor q12, q12, q11 - veor q3, q3, q10 - veor q6, q6, q12 - veor q0, q0, q12 - veor q5, q5, q8 - veor q3, q3, q8 - - veor q12, q7, q4 - veor q8, q1, q2 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q4 - veor q12, q12, q8 - veor q4, q4, q2 - vand q8, q8, q15 - vand q2, q2, q13 - vand q12, q12, q14 - vand q4, q4, q9 - veor q8, q8, q12 - veor q4, q4, q2 - veor q12, q12, q11 - veor q2, q2, q10 - veor q15, q15, q13 - veor q14, q14, q9 - veor q10, q15, q14 - vand q10, q10, q7 - veor q7, q7, q1 - vand q11, q1, q15 - vand q7, q7, q14 - veor q1, q11, q10 - veor q7, q7, q11 - veor q7, q7, q12 - veor q4, q4, q12 - veor q1, q1, q8 - veor q2, q2, q8 - veor q7, q7, q0 - veor q1, q1, q6 - veor q6, q6, q0 - veor q4, q4, q7 - veor q0, q0, q1 - - veor q1, q1, q5 - veor q5, q5, q2 - veor q2, q2, q3 - veor q3, q3, q5 - veor q4, q4, q5 - - veor q6, q6, q3 - subs r5,r5,#1 - bcc Lenc_done - vext.8 q8, q0, q0, #12 @ x0 <<< 32 - vext.8 q9, q1, q1, #12 - veor q0, q0, q8 @ x0 ^ (x0 <<< 32) - vext.8 q10, q4, q4, #12 - veor q1, 
q1, q9 - vext.8 q11, q6, q6, #12 - veor q4, q4, q10 - vext.8 q12, q3, q3, #12 - veor q6, q6, q11 - vext.8 q13, q7, q7, #12 - veor q3, q3, q12 - vext.8 q14, q2, q2, #12 - veor q7, q7, q13 - vext.8 q15, q5, q5, #12 - veor q2, q2, q14 - - veor q9, q9, q0 - veor q5, q5, q15 - vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) - veor q10, q10, q1 - veor q8, q8, q5 - veor q9, q9, q5 - vext.8 q1, q1, q1, #8 - veor q13, q13, q3 - veor q0, q0, q8 - veor q14, q14, q7 - veor q1, q1, q9 - vext.8 q8, q3, q3, #8 - veor q12, q12, q6 - vext.8 q9, q7, q7, #8 - veor q15, q15, q2 - vext.8 q3, q6, q6, #8 - veor q11, q11, q4 - vext.8 q7, q5, q5, #8 - veor q12, q12, q5 - vext.8 q6, q2, q2, #8 - veor q11, q11, q5 - vext.8 q2, q4, q4, #8 - veor q5, q9, q13 - veor q4, q8, q12 - veor q3, q3, q11 - veor q7, q7, q15 - veor q6, q6, q14 - @ vmov q4, q8 - veor q2, q2, q10 - @ vmov q5, q9 - vldmia r6, {q12} @ LSR - ite eq @ Thumb2 thing, samity check in ARM - addeq r6,r6,#0x10 - bne Lenc_loop - vldmia r6, {q12} @ LSRM0 - b Lenc_loop -.align 4 -Lenc_done: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q3, #1 - veor q10, q10, q5 - veor q11, q11, q7 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #1 - veor q7, q7, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q3, q3, q11 - vshr.u64 q10, q4, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q6 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q6, q6, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q4, q4, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q7, #2 - vshr.u64 q11, q3, #2 - veor q10, q10, q5 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q5, q5, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q7, q7, q10 - veor q3, q3, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q6 - veor q11, q11, q4 - vand q10, q10, q9 - vand q11, q11, q9 - veor q6, q6, q10 - vshl.u64 q10, q10, #2 - veor q4, q4, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q6, #4 - vshr.u64 q11, q4, #4 - veor q10, q10, q5 - veor q11, q11, q2 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q2, q2, q11 - vshl.u64 q11, q11, #4 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q7 - veor q11, q11, q3 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q3, q3, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - vldmia r4, {q8} @ last round key - veor q4, q4, q8 - veor q6, q6, q8 - veor q3, q3, q8 - veor q7, q7, q8 - veor q2, q2, q8 - veor q5, q5, q8 - veor q0, q0, q8 - veor q1, q1, q8 - bx lr - -#ifdef __thumb2__ -.thumb_func _bsaes_key_convert -#endif -.align 4 -_bsaes_key_convert: - adr r6,. - vld1.8 {q7}, [r4]! @ load round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0 -#else - sub r6,r6,#_bsaes_key_convert-LM0 -#endif - vld1.8 {q15}, [r4]! 
@ load round 1 key - - vmov.i8 q8, #0x01 @ bit masks - vmov.i8 q9, #0x02 - vmov.i8 q10, #0x04 - vmov.i8 q11, #0x08 - vmov.i8 q12, #0x10 - vmov.i8 q13, #0x20 - vldmia r6, {q14} @ LM0 - -#ifdef __ARMEL__ - vrev32.8 q7, q7 - vrev32.8 q15, q15 -#endif - sub r5,r5,#1 - vstmia r12!, {q7} @ save round 0 key - b Lkey_loop - -.align 4 -Lkey_loop: - vtbl.8 d14,{q15},d28 - vtbl.8 d15,{q15},d29 - vmov.i8 q6, #0x40 - vmov.i8 q15, #0x80 - - vtst.8 q0, q7, q8 - vtst.8 q1, q7, q9 - vtst.8 q2, q7, q10 - vtst.8 q3, q7, q11 - vtst.8 q4, q7, q12 - vtst.8 q5, q7, q13 - vtst.8 q6, q7, q6 - vtst.8 q7, q7, q15 - vld1.8 {q15}, [r4]! @ load next round key - vmvn q0, q0 @ "pnot" - vmvn q1, q1 - vmvn q5, q5 - vmvn q6, q6 -#ifdef __ARMEL__ - vrev32.8 q15, q15 -#endif - subs r5,r5,#1 - vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key - bne Lkey_loop - - vmov.i8 q7,#0x63 @ compose L63 - @ don't save last round key - bx lr - -.globl _bsaes_cbc_encrypt -.private_extern _bsaes_cbc_encrypt -#ifdef __thumb2__ -.thumb_func _bsaes_cbc_encrypt -#endif -.align 5 -_bsaes_cbc_encrypt: - @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for - @ short inputs. We patch this out, using bsaes for all input sizes. - - @ it is up to the caller to make sure we are called with enc == 0 - - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} - VFP_ABI_PUSH - ldr r8, [ip] @ IV is 1st arg on the stack - mov r2, r2, lsr#4 @ len in 16 byte blocks - sub sp, #0x10 @ scratch space to carry over the IV - mov r9, sp @ save sp - - ldr r10, [r3, #240] @ get # of rounds -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key - add r12, #96 @ sifze of bit-slices key schedule - - @ populate the key schedule - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - mov sp, r12 @ sp is sp - bl _bsaes_key_convert - vldmia sp, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia sp, {q7} -#else - ldr r12, [r3, #244] - eors r12, #1 - beq 0f - - @ populate the key schedule - str r12, [r3, #244] - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - add r12, r3, #248 @ pass key schedule - bl _bsaes_key_convert - add r4, r3, #248 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} - -.align 2 - -#endif - - vld1.8 {q15}, [r8] @ load IV - b Lcbc_dec_loop - -.align 4 -Lcbc_dec_loop: - subs r2, r2, #0x8 - bmi Lcbc_dec_loop_finish - - vld1.8 {q0,q1}, [r0]! @ load input - vld1.8 {q2,q3}, [r0]! -#ifndef BSAES_ASM_EXTENDED_KEY - mov r4, sp @ pass the key -#else - add r4, r3, #248 -#endif - vld1.8 {q4,q5}, [r0]! - mov r5, r10 - vld1.8 {q6,q7}, [r0] - sub r0, r0, #0x60 - vstmia r9, {q15} @ put aside IV - - bl _bsaes_decrypt8 - - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q14,q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q3, q3, q13 - vst1.8 {q6}, [r1]! - veor q5, q5, q14 - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - vst1.8 {q3}, [r1]! - vst1.8 {q5}, [r1]! - - b Lcbc_dec_loop - -Lcbc_dec_loop_finish: - adds r2, r2, #8 - beq Lcbc_dec_done - - @ Set up most parameters for the _bsaes_decrypt8 call. 
-#ifndef BSAES_ASM_EXTENDED_KEY - mov r4, sp @ pass the key -#else - add r4, r3, #248 -#endif - mov r5, r10 - vstmia r9, {q15} @ put aside IV - - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo Lcbc_dec_one - vld1.8 {q1}, [r0]! - beq Lcbc_dec_two - vld1.8 {q2}, [r0]! - cmp r2, #4 - blo Lcbc_dec_three - vld1.8 {q3}, [r0]! - beq Lcbc_dec_four - vld1.8 {q4}, [r0]! - cmp r2, #6 - blo Lcbc_dec_five - vld1.8 {q5}, [r0]! - beq Lcbc_dec_six - vld1.8 {q6}, [r0]! - sub r0, r0, #0x70 - - bl _bsaes_decrypt8 - - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q3, q3, q13 - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - vst1.8 {q3}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_six: - sub r0, r0, #0x60 - bl _bsaes_decrypt8 - vldmia r9,{q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_five: - sub r0, r0, #0x50 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q15}, [r0]! - veor q4, q4, q10 - vst1.8 {q0,q1}, [r1]! @ write output - veor q2, q2, q11 - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_four: - sub r0, r0, #0x40 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q15}, [r0]! - veor q4, q4, q10 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_three: - sub r0, r0, #0x30 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q15}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_two: - sub r0, r0, #0x20 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q15}, [r0]! @ reload input - veor q1, q1, q8 - vst1.8 {q0,q1}, [r1]! @ write output - b Lcbc_dec_done -.align 4 -Lcbc_dec_one: - sub r0, r0, #0x10 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q15}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vst1.8 {q0}, [r1]! 
@ write output - -Lcbc_dec_done: -#ifndef BSAES_ASM_EXTENDED_KEY - vmov.i32 q0, #0 - vmov.i32 q1, #0 -Lcbc_dec_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r9 - bne Lcbc_dec_bzero -#endif - - mov sp, r9 - add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb - vst1.8 {q15}, [r8] @ return IV - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} - -.globl _bsaes_ctr32_encrypt_blocks -.private_extern _bsaes_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _bsaes_ctr32_encrypt_blocks -#endif -.align 5 -_bsaes_ctr32_encrypt_blocks: - @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this - @ out to retain a constant-time implementation. - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} - VFP_ABI_PUSH - ldr r8, [ip] @ ctr is 1st arg on the stack - sub sp, sp, #0x10 @ scratch space to carry over the ctr - mov r9, sp @ save sp - - ldr r10, [r3, #240] @ get # of rounds -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key - add r12, #96 @ size of bit-sliced key schedule - - @ populate the key schedule - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - mov sp, r12 @ sp is sp - bl _bsaes_key_convert - veor q7,q7,q15 @ fix up last round key - vstmia r12, {q7} @ save last round key - - vld1.8 {q0}, [r8] @ load counter -#ifdef __APPLE__ - mov r8, #:lower16:(LREVM0SR-LM0) - add r8, r6, r8 -#else - add r8, r6, #LREVM0SR-LM0 @ borrow r8 -#endif - vldmia sp, {q4} @ load round0 key -#else - ldr r12, [r3, #244] - eors r12, #1 - beq 0f - - @ populate the key schedule - str r12, [r3, #244] - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - add r12, r3, #248 @ pass key schedule - bl _bsaes_key_convert - veor q7,q7,q15 @ fix up last round key - vstmia r12, {q7} @ save last round key - -.align 2 - add r12, r3, #248 - vld1.8 {q0}, [r8] @ load counter - adrl r8, LREVM0SR @ borrow r8 - vldmia r12, {q4} @ load round0 key - sub sp, #0x10 @ place for adjusted round0 key -#endif - - vmov.i32 q8,#1 @ compose 1<<96 - veor q9,q9,q9 - vrev32.8 q0,q0 - vext.8 q8,q9,q8,#4 - vrev32.8 q4,q4 - vadd.u32 q9,q8,q8 @ compose 2<<96 - vstmia sp, {q4} @ save adjusted round0 key - b Lctr_enc_loop - -.align 4 -Lctr_enc_loop: - vadd.u32 q10, q8, q9 @ compose 3<<96 - vadd.u32 q1, q0, q8 @ +1 - vadd.u32 q2, q0, q9 @ +2 - vadd.u32 q3, q0, q10 @ +3 - vadd.u32 q4, q1, q10 - vadd.u32 q5, q2, q10 - vadd.u32 q6, q3, q10 - vadd.u32 q7, q4, q10 - vadd.u32 q10, q5, q10 @ next counter - - @ Borrow prologue from _bsaes_encrypt8 to use the opportunity - @ to flip byte order in 32-bit counter - - vldmia sp, {q9} @ load round0 key -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x10 @ pass next round key -#else - add r4, r3, #264 -#endif - vldmia r8, {q8} @ LREVM0SR - mov r5, r10 @ pass rounds - vstmia r9, {q10} @ save next counter -#ifdef __APPLE__ - mov r6, #:lower16:(LREVM0SR-LSR) - sub r6, r8, r6 -#else - sub r6, r8, #LREVM0SR-LSR @ pass constants -#endif - - bl _bsaes_encrypt8_alt - - subs r2, r2, #8 - blo Lctr_enc_loop_done - - vld1.8 {q8,q9}, [r0]! @ load input - vld1.8 {q10,q11}, [r0]! - veor q0, q8 - veor q1, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q10 - veor q6, q11 - vld1.8 {q14,q15}, [r0]! - veor q3, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q7, q13 - veor q2, q14 - vst1.8 {q4}, [r1]! - veor q5, q15 - vst1.8 {q6}, [r1]! - vmov.i32 q8, #1 @ compose 1<<96 - vst1.8 {q3}, [r1]! - veor q9, q9, q9 - vst1.8 {q7}, [r1]! - vext.8 q8, q9, q8, #4 - vst1.8 {q2}, [r1]! 
- vadd.u32 q9,q8,q8 @ compose 2<<96 - vst1.8 {q5}, [r1]! - vldmia r9, {q0} @ load counter - - bne Lctr_enc_loop - b Lctr_enc_done - -.align 4 -Lctr_enc_loop_done: - add r2, r2, #8 - vld1.8 {q8}, [r0]! @ load input - veor q0, q8 - vst1.8 {q0}, [r1]! @ write output - cmp r2, #2 - blo Lctr_enc_done - vld1.8 {q9}, [r0]! - veor q1, q9 - vst1.8 {q1}, [r1]! - beq Lctr_enc_done - vld1.8 {q10}, [r0]! - veor q4, q10 - vst1.8 {q4}, [r1]! - cmp r2, #4 - blo Lctr_enc_done - vld1.8 {q11}, [r0]! - veor q6, q11 - vst1.8 {q6}, [r1]! - beq Lctr_enc_done - vld1.8 {q12}, [r0]! - veor q3, q12 - vst1.8 {q3}, [r1]! - cmp r2, #6 - blo Lctr_enc_done - vld1.8 {q13}, [r0]! - veor q7, q13 - vst1.8 {q7}, [r1]! - beq Lctr_enc_done - vld1.8 {q14}, [r0] - veor q2, q14 - vst1.8 {q2}, [r1]! - -Lctr_enc_done: - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifndef BSAES_ASM_EXTENDED_KEY -Lctr_enc_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r9 - bne Lctr_enc_bzero -#else - vstmia sp, {q0,q1} -#endif - - mov sp, r9 - add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - - @ OpenSSL contains aes_nohw_* fallback code here. We patch this - @ out to retain a constant-time implementation. - -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/aead.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/aead.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/aead.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/aead.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/cipher.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/cipher.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/cipher.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/cipher.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c.inc similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c.inc index 12387394..57f8d16e 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c.inc @@ -56,11 +56,11 @@ #include #include #include -#include #include "internal.h" #include "../../internal.h" #include "../aes/internal.h" +#include "../bcm_interface.h" #include "../modes/internal.h" #include "../service_indicator/internal.h" #include "../delocate.h" @@ -471,11 +471,11 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) { } OPENSSL_memcpy(gctx->iv, ptr, arg); if (c->encrypt) { - // |RAND_bytes| calls within the fipsmodule should be wrapped with state - // lock functions to avoid updating the service indicator with the DRBG - // functions. + // |BCM_rand_bytes| calls within the fipsmodule should be wrapped with + // state lock functions to avoid updating the service indicator with the + // DRBG functions. 
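The RAND_bytes-to-BCM_rand_bytes substitutions in this file keep the same bracketing discipline: DRBG output drawn for internal bookkeeping (such as an auto-generated GCM nonce) is wrapped in service-indicator lock/unlock calls so it does not register as an approved-service use. A small sketch of that wrapping pattern is shown below, with illustrative stand-in names rather than the real BoringSSL internals.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the lock/unlock bracketing: randomness drawn for internal use must
   not update the FIPS approved-service indicator, so the DRBG call sits between
   lock and unlock. Every name below is an illustrative stand-in. */
static void service_indicator_lock(void) { /* suspend indicator updates */ }
static void service_indicator_unlock(void) { /* resume indicator updates */ }

static void drbg_generate(uint8_t *out, size_t len) {
  for (size_t i = 0; i < len; i++) {
    out[i] = 0; /* placeholder bytes; a real DRBG call goes here */
  }
}

static void fill_internal_nonce(uint8_t *nonce, size_t len) {
  service_indicator_lock();
  drbg_generate(nonce, len);
  service_indicator_unlock();
}

int main(void) {
  uint8_t nonce[12];
  fill_internal_nonce(nonce, sizeof(nonce));
  return nonce[0];
}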
FIPS_service_indicator_lock_state(); - RAND_bytes(gctx->iv + arg, gctx->ivlen - arg); + BCM_rand_bytes(gctx->iv + arg, gctx->ivlen - arg); FIPS_service_indicator_unlock_state(); } gctx->iv_gen = 1; @@ -1167,10 +1167,11 @@ static int aead_aes_gcm_seal_scatter_randnonce( return 0; } - // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock - // functions to avoid updating the service indicator with the DRBG functions. + // |BCM_rand_bytes| calls within the fipsmodule should be wrapped with state + // lock functions to avoid updating the service indicator with the DRBG + // functions. FIPS_service_indicator_lock_state(); - RAND_bytes(nonce, sizeof(nonce)); + BCM_rand_bytes(nonce, sizeof(nonce)); FIPS_service_indicator_unlock_state(); const struct aead_aes_gcm_ctx *gcm_ctx = diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aesccm.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aesccm.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aesccm.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aesccm.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/internal.h index fe960b63..3f00614c 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/internal.h @@ -134,9 +134,6 @@ struct evp_cipher_st { // flags contains the OR of a number of flags. See |EVP_CIPH_*|. uint32_t flags; - // app_data is a pointer to opaque, user data. - void *app_data; - int (*init)(EVP_CIPHER_CTX *ctx, const uint8_t *key, const uint8_t *iv, int enc); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cmac/cmac.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cmac/cmac.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cmac/cmac.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cmac/cmac.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-windows.windows.x86.S deleted file mode 100644 index 763579fd..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-windows.windows.x86.S +++ /dev/null @@ -1,1270 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
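The Windows x86 file removed below is generated Comba multiplication: bn_mul_comba8 and bn_mul_comba4 compute each result word as the column sum of all a[i]*b[j] with i + j fixed, carried through a three-word accumulator, which is exactly the "Calculate word 0", "Calculate word 1", ... sequence annotated in the assembly. A plain C sketch of the 4-word case with 32-bit words follows; it is illustrative only, not the BoringSSL implementation.

#include <stdint.h>
#include <stdio.h>

/* Comba (column-wise schoolbook) multiplication of two 4-word numbers. Each
   output word r[k] is the sum of all a[i]*b[j] with i + j == k, accumulated in
   a three-word (c0, c1, c2) accumulator. */
static void mul_comba4(uint32_t r[8], const uint32_t a[4], const uint32_t b[4]) {
  uint32_t c0 = 0, c1 = 0, c2 = 0;
  for (int k = 0; k < 7; k++) {
    for (int i = 0; i < 4; i++) {
      int j = k - i;
      if (j < 0 || j > 3) {
        continue;
      }
      uint64_t p = (uint64_t)a[i] * b[j];
      uint64_t t = (uint64_t)c0 + (uint32_t)p;                        /* low half      */
      c0 = (uint32_t)t;
      t = (uint64_t)c1 + (uint32_t)(p >> 32) + (uint32_t)(t >> 32);   /* high + carry  */
      c1 = (uint32_t)t;
      c2 += (uint32_t)(t >> 32);                                      /* overflow word */
    }
    r[k] = c0; /* column k is complete */
    c0 = c1;
    c1 = c2;
    c2 = 0;
  }
  r[7] = c0;
}

int main(void) {
  const uint32_t a[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
  const uint32_t b[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
  uint32_t r[8];
  mul_comba4(r, a, b);
  for (int k = 7; k >= 0; k--) {
    printf("%08x", r[k]); /* prints (2^128 - 1)^2 */
  }
  printf("\n");
  return 0;
}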
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _bn_mul_comba8 -align 16 -_bn_mul_comba8: -L$_bn_mul_comba8_begin: - push esi - mov esi,DWORD [12+esp] - push edi - mov edi,DWORD [20+esp] - push ebp - push ebx - xor ebx,ebx - mov eax,DWORD [esi] - xor ecx,ecx - mov edx,DWORD [edi] - ; ################## Calculate word 0 - xor ebp,ebp - ; mul a[0]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [edi] - adc ebp,0 - mov DWORD [eax],ebx - mov eax,DWORD [4+esi] - ; saved r[0] - ; ################## Calculate word 1 - xor ebx,ebx - ; mul a[1]*b[0] - mul edx - add ecx,eax - mov eax,DWORD [esi] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - ; mul a[0]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [edi] - adc ebx,0 - mov DWORD [4+eax],ecx - mov eax,DWORD [8+esi] - ; saved r[1] - ; ################## Calculate word 2 - xor ecx,ecx - ; mul a[2]*b[0] - mul edx - add ebp,eax - mov eax,DWORD [4+esi] - adc ebx,edx - mov edx,DWORD [4+edi] - adc ecx,0 - ; mul a[1]*b[1] - mul edx - add ebp,eax - mov eax,DWORD [esi] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - ; mul a[0]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [edi] - adc ecx,0 - mov DWORD [8+eax],ebp - mov eax,DWORD [12+esi] - ; saved r[2] - ; ################## Calculate word 3 - xor ebp,ebp - ; mul a[3]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [8+esi] - adc ecx,edx - mov edx,DWORD [4+edi] - adc ebp,0 - ; mul a[2]*b[1] - mul edx - add ebx,eax - mov eax,DWORD [4+esi] - adc ecx,edx - mov edx,DWORD [8+edi] - adc ebp,0 - ; mul a[1]*b[2] - mul edx - add ebx,eax - mov eax,DWORD [esi] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - ; mul a[0]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [edi] - adc ebp,0 - mov DWORD [12+eax],ebx - mov eax,DWORD [16+esi] - ; saved r[3] - ; ################## Calculate word 4 - xor ebx,ebx - ; mul a[4]*b[0] - mul edx - add ecx,eax - mov eax,DWORD [12+esi] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - ; mul a[3]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [8+esi] - adc ebp,edx - mov edx,DWORD [8+edi] - adc ebx,0 - ; mul a[2]*b[2] - mul edx - add ecx,eax - mov eax,DWORD [4+esi] - adc ebp,edx - mov edx,DWORD [12+edi] - adc ebx,0 - ; mul a[1]*b[3] - mul edx - add ecx,eax - mov eax,DWORD [esi] - adc ebp,edx - mov edx,DWORD [16+edi] - adc ebx,0 - ; mul a[0]*b[4] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [edi] - adc ebx,0 - mov DWORD [16+eax],ecx - mov eax,DWORD [20+esi] - ; saved r[4] - ; ################## Calculate word 5 - xor ecx,ecx - ; mul a[5]*b[0] - mul edx - add ebp,eax - mov eax,DWORD [16+esi] - adc ebx,edx - mov edx,DWORD [4+edi] - adc ecx,0 - ; mul a[4]*b[1] - mul edx - add ebp,eax - mov eax,DWORD [12+esi] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - ; mul a[3]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [8+esi] - adc ebx,edx - mov edx,DWORD [12+edi] - adc ecx,0 - ; mul a[2]*b[3] - mul edx - add ebp,eax - mov eax,DWORD [4+esi] - adc ebx,edx - mov edx,DWORD [16+edi] - adc ecx,0 - ; mul a[1]*b[4] - mul edx - add ebp,eax - mov eax,DWORD [esi] - adc ebx,edx - mov edx,DWORD [20+edi] - adc ecx,0 - ; mul a[0]*b[5] - mul edx - add ebp,eax - mov eax,DWORD 
[20+esp] - adc ebx,edx - mov edx,DWORD [edi] - adc ecx,0 - mov DWORD [20+eax],ebp - mov eax,DWORD [24+esi] - ; saved r[5] - ; ################## Calculate word 6 - xor ebp,ebp - ; mul a[6]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [20+esi] - adc ecx,edx - mov edx,DWORD [4+edi] - adc ebp,0 - ; mul a[5]*b[1] - mul edx - add ebx,eax - mov eax,DWORD [16+esi] - adc ecx,edx - mov edx,DWORD [8+edi] - adc ebp,0 - ; mul a[4]*b[2] - mul edx - add ebx,eax - mov eax,DWORD [12+esi] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - ; mul a[3]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [8+esi] - adc ecx,edx - mov edx,DWORD [16+edi] - adc ebp,0 - ; mul a[2]*b[4] - mul edx - add ebx,eax - mov eax,DWORD [4+esi] - adc ecx,edx - mov edx,DWORD [20+edi] - adc ebp,0 - ; mul a[1]*b[5] - mul edx - add ebx,eax - mov eax,DWORD [esi] - adc ecx,edx - mov edx,DWORD [24+edi] - adc ebp,0 - ; mul a[0]*b[6] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [edi] - adc ebp,0 - mov DWORD [24+eax],ebx - mov eax,DWORD [28+esi] - ; saved r[6] - ; ################## Calculate word 7 - xor ebx,ebx - ; mul a[7]*b[0] - mul edx - add ecx,eax - mov eax,DWORD [24+esi] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - ; mul a[6]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [20+esi] - adc ebp,edx - mov edx,DWORD [8+edi] - adc ebx,0 - ; mul a[5]*b[2] - mul edx - add ecx,eax - mov eax,DWORD [16+esi] - adc ebp,edx - mov edx,DWORD [12+edi] - adc ebx,0 - ; mul a[4]*b[3] - mul edx - add ecx,eax - mov eax,DWORD [12+esi] - adc ebp,edx - mov edx,DWORD [16+edi] - adc ebx,0 - ; mul a[3]*b[4] - mul edx - add ecx,eax - mov eax,DWORD [8+esi] - adc ebp,edx - mov edx,DWORD [20+edi] - adc ebx,0 - ; mul a[2]*b[5] - mul edx - add ecx,eax - mov eax,DWORD [4+esi] - adc ebp,edx - mov edx,DWORD [24+edi] - adc ebx,0 - ; mul a[1]*b[6] - mul edx - add ecx,eax - mov eax,DWORD [esi] - adc ebp,edx - mov edx,DWORD [28+edi] - adc ebx,0 - ; mul a[0]*b[7] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - mov DWORD [28+eax],ecx - mov eax,DWORD [28+esi] - ; saved r[7] - ; ################## Calculate word 8 - xor ecx,ecx - ; mul a[7]*b[1] - mul edx - add ebp,eax - mov eax,DWORD [24+esi] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - ; mul a[6]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [20+esi] - adc ebx,edx - mov edx,DWORD [12+edi] - adc ecx,0 - ; mul a[5]*b[3] - mul edx - add ebp,eax - mov eax,DWORD [16+esi] - adc ebx,edx - mov edx,DWORD [16+edi] - adc ecx,0 - ; mul a[4]*b[4] - mul edx - add ebp,eax - mov eax,DWORD [12+esi] - adc ebx,edx - mov edx,DWORD [20+edi] - adc ecx,0 - ; mul a[3]*b[5] - mul edx - add ebp,eax - mov eax,DWORD [8+esi] - adc ebx,edx - mov edx,DWORD [24+edi] - adc ecx,0 - ; mul a[2]*b[6] - mul edx - add ebp,eax - mov eax,DWORD [4+esi] - adc ebx,edx - mov edx,DWORD [28+edi] - adc ecx,0 - ; mul a[1]*b[7] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - mov DWORD [32+eax],ebp - mov eax,DWORD [28+esi] - ; saved r[8] - ; ################## Calculate word 9 - xor ebp,ebp - ; mul a[7]*b[2] - mul edx - add ebx,eax - mov eax,DWORD [24+esi] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - ; mul a[6]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [20+esi] - adc ecx,edx - mov edx,DWORD [16+edi] - adc ebp,0 - ; mul a[5]*b[4] - mul edx - add ebx,eax - mov eax,DWORD [16+esi] - adc ecx,edx - mov edx,DWORD [20+edi] - adc ebp,0 - ; mul a[4]*b[5] - mul edx - add ebx,eax - mov eax,DWORD [12+esi] - adc ecx,edx - mov 
edx,DWORD [24+edi] - adc ebp,0 - ; mul a[3]*b[6] - mul edx - add ebx,eax - mov eax,DWORD [8+esi] - adc ecx,edx - mov edx,DWORD [28+edi] - adc ebp,0 - ; mul a[2]*b[7] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - mov DWORD [36+eax],ebx - mov eax,DWORD [28+esi] - ; saved r[9] - ; ################## Calculate word 10 - xor ebx,ebx - ; mul a[7]*b[3] - mul edx - add ecx,eax - mov eax,DWORD [24+esi] - adc ebp,edx - mov edx,DWORD [16+edi] - adc ebx,0 - ; mul a[6]*b[4] - mul edx - add ecx,eax - mov eax,DWORD [20+esi] - adc ebp,edx - mov edx,DWORD [20+edi] - adc ebx,0 - ; mul a[5]*b[5] - mul edx - add ecx,eax - mov eax,DWORD [16+esi] - adc ebp,edx - mov edx,DWORD [24+edi] - adc ebx,0 - ; mul a[4]*b[6] - mul edx - add ecx,eax - mov eax,DWORD [12+esi] - adc ebp,edx - mov edx,DWORD [28+edi] - adc ebx,0 - ; mul a[3]*b[7] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [16+edi] - adc ebx,0 - mov DWORD [40+eax],ecx - mov eax,DWORD [28+esi] - ; saved r[10] - ; ################## Calculate word 11 - xor ecx,ecx - ; mul a[7]*b[4] - mul edx - add ebp,eax - mov eax,DWORD [24+esi] - adc ebx,edx - mov edx,DWORD [20+edi] - adc ecx,0 - ; mul a[6]*b[5] - mul edx - add ebp,eax - mov eax,DWORD [20+esi] - adc ebx,edx - mov edx,DWORD [24+edi] - adc ecx,0 - ; mul a[5]*b[6] - mul edx - add ebp,eax - mov eax,DWORD [16+esi] - adc ebx,edx - mov edx,DWORD [28+edi] - adc ecx,0 - ; mul a[4]*b[7] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [20+edi] - adc ecx,0 - mov DWORD [44+eax],ebp - mov eax,DWORD [28+esi] - ; saved r[11] - ; ################## Calculate word 12 - xor ebp,ebp - ; mul a[7]*b[5] - mul edx - add ebx,eax - mov eax,DWORD [24+esi] - adc ecx,edx - mov edx,DWORD [24+edi] - adc ebp,0 - ; mul a[6]*b[6] - mul edx - add ebx,eax - mov eax,DWORD [20+esi] - adc ecx,edx - mov edx,DWORD [28+edi] - adc ebp,0 - ; mul a[5]*b[7] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [24+edi] - adc ebp,0 - mov DWORD [48+eax],ebx - mov eax,DWORD [28+esi] - ; saved r[12] - ; ################## Calculate word 13 - xor ebx,ebx - ; mul a[7]*b[6] - mul edx - add ecx,eax - mov eax,DWORD [24+esi] - adc ebp,edx - mov edx,DWORD [28+edi] - adc ebx,0 - ; mul a[6]*b[7] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [28+edi] - adc ebx,0 - mov DWORD [52+eax],ecx - mov eax,DWORD [28+esi] - ; saved r[13] - ; ################## Calculate word 14 - xor ecx,ecx - ; mul a[7]*b[7] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - adc ecx,0 - mov DWORD [56+eax],ebp - ; saved r[14] - ; save r[15] - mov DWORD [60+eax],ebx - pop ebx - pop ebp - pop edi - pop esi - ret -global _bn_mul_comba4 -align 16 -_bn_mul_comba4: -L$_bn_mul_comba4_begin: - push esi - mov esi,DWORD [12+esp] - push edi - mov edi,DWORD [20+esp] - push ebp - push ebx - xor ebx,ebx - mov eax,DWORD [esi] - xor ecx,ecx - mov edx,DWORD [edi] - ; ################## Calculate word 0 - xor ebp,ebp - ; mul a[0]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [edi] - adc ebp,0 - mov DWORD [eax],ebx - mov eax,DWORD [4+esi] - ; saved r[0] - ; ################## Calculate word 1 - xor ebx,ebx - ; mul a[1]*b[0] - mul edx - add ecx,eax - mov eax,DWORD [esi] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - ; mul a[0]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [edi] - adc ebx,0 - mov DWORD [4+eax],ecx - mov eax,DWORD [8+esi] - ; saved 
r[1] - ; ################## Calculate word 2 - xor ecx,ecx - ; mul a[2]*b[0] - mul edx - add ebp,eax - mov eax,DWORD [4+esi] - adc ebx,edx - mov edx,DWORD [4+edi] - adc ecx,0 - ; mul a[1]*b[1] - mul edx - add ebp,eax - mov eax,DWORD [esi] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - ; mul a[0]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [edi] - adc ecx,0 - mov DWORD [8+eax],ebp - mov eax,DWORD [12+esi] - ; saved r[2] - ; ################## Calculate word 3 - xor ebp,ebp - ; mul a[3]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [8+esi] - adc ecx,edx - mov edx,DWORD [4+edi] - adc ebp,0 - ; mul a[2]*b[1] - mul edx - add ebx,eax - mov eax,DWORD [4+esi] - adc ecx,edx - mov edx,DWORD [8+edi] - adc ebp,0 - ; mul a[1]*b[2] - mul edx - add ebx,eax - mov eax,DWORD [esi] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - ; mul a[0]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [4+edi] - adc ebp,0 - mov DWORD [12+eax],ebx - mov eax,DWORD [12+esi] - ; saved r[3] - ; ################## Calculate word 4 - xor ebx,ebx - ; mul a[3]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [8+esi] - adc ebp,edx - mov edx,DWORD [8+edi] - adc ebx,0 - ; mul a[2]*b[2] - mul edx - add ecx,eax - mov eax,DWORD [4+esi] - adc ebp,edx - mov edx,DWORD [12+edi] - adc ebx,0 - ; mul a[1]*b[3] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [8+edi] - adc ebx,0 - mov DWORD [16+eax],ecx - mov eax,DWORD [12+esi] - ; saved r[4] - ; ################## Calculate word 5 - xor ecx,ecx - ; mul a[3]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [8+esi] - adc ebx,edx - mov edx,DWORD [12+edi] - adc ecx,0 - ; mul a[2]*b[3] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [12+edi] - adc ecx,0 - mov DWORD [20+eax],ebp - mov eax,DWORD [12+esi] - ; saved r[5] - ; ################## Calculate word 6 - xor ebp,ebp - ; mul a[3]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - adc ebp,0 - mov DWORD [24+eax],ebx - ; saved r[6] - ; save r[7] - mov DWORD [28+eax],ecx - pop ebx - pop ebp - pop edi - pop esi - ret -global _bn_sqr_comba8 -align 16 -_bn_sqr_comba8: -L$_bn_sqr_comba8_begin: - push esi - push edi - push ebp - push ebx - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - xor ebx,ebx - xor ecx,ecx - mov eax,DWORD [esi] - ; ############### Calculate word 0 - xor ebp,ebp - ; sqr a[0]*a[0] - mul eax - add ebx,eax - adc ecx,edx - mov edx,DWORD [esi] - adc ebp,0 - mov DWORD [edi],ebx - mov eax,DWORD [4+esi] - ; saved r[0] - ; ############### Calculate word 1 - xor ebx,ebx - ; sqr a[1]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [8+esi] - adc ebx,0 - mov DWORD [4+edi],ecx - mov edx,DWORD [esi] - ; saved r[1] - ; ############### Calculate word 2 - xor ecx,ecx - ; sqr a[2]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [4+esi] - adc ecx,0 - ; sqr a[1]*a[1] - mul eax - add ebp,eax - adc ebx,edx - mov edx,DWORD [esi] - adc ecx,0 - mov DWORD [8+edi],ebp - mov eax,DWORD [12+esi] - ; saved r[2] - ; ############### Calculate word 3 - xor ebp,ebp - ; sqr a[3]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [8+esi] - adc ebp,0 - mov edx,DWORD [4+esi] - ; sqr a[2]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [16+esi] - adc ebp,0 - mov DWORD [12+edi],ebx - mov edx,DWORD [esi] - ; saved r[3] - ; 
############### Calculate word 4 - xor ebx,ebx - ; sqr a[4]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [12+esi] - adc ebx,0 - mov edx,DWORD [4+esi] - ; sqr a[3]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [8+esi] - adc ebx,0 - ; sqr a[2]*a[2] - mul eax - add ecx,eax - adc ebp,edx - mov edx,DWORD [esi] - adc ebx,0 - mov DWORD [16+edi],ecx - mov eax,DWORD [20+esi] - ; saved r[4] - ; ############### Calculate word 5 - xor ecx,ecx - ; sqr a[5]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [16+esi] - adc ecx,0 - mov edx,DWORD [4+esi] - ; sqr a[4]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [12+esi] - adc ecx,0 - mov edx,DWORD [8+esi] - ; sqr a[3]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [24+esi] - adc ecx,0 - mov DWORD [20+edi],ebp - mov edx,DWORD [esi] - ; saved r[5] - ; ############### Calculate word 6 - xor ebp,ebp - ; sqr a[6]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [20+esi] - adc ebp,0 - mov edx,DWORD [4+esi] - ; sqr a[5]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [16+esi] - adc ebp,0 - mov edx,DWORD [8+esi] - ; sqr a[4]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [12+esi] - adc ebp,0 - ; sqr a[3]*a[3] - mul eax - add ebx,eax - adc ecx,edx - mov edx,DWORD [esi] - adc ebp,0 - mov DWORD [24+edi],ebx - mov eax,DWORD [28+esi] - ; saved r[6] - ; ############### Calculate word 7 - xor ebx,ebx - ; sqr a[7]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [24+esi] - adc ebx,0 - mov edx,DWORD [4+esi] - ; sqr a[6]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [20+esi] - adc ebx,0 - mov edx,DWORD [8+esi] - ; sqr a[5]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [16+esi] - adc ebx,0 - mov edx,DWORD [12+esi] - ; sqr a[4]*a[3] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [28+esi] - adc ebx,0 - mov DWORD [28+edi],ecx - mov edx,DWORD [4+esi] - ; saved r[7] - ; ############### Calculate word 8 - xor ecx,ecx - ; sqr a[7]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [24+esi] - adc ecx,0 - mov edx,DWORD [8+esi] - ; sqr a[6]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [20+esi] - adc ecx,0 - mov edx,DWORD [12+esi] - ; sqr a[5]*a[3] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [16+esi] - adc ecx,0 - ; sqr a[4]*a[4] - mul eax - add ebp,eax - adc ebx,edx - mov edx,DWORD [8+esi] - adc ecx,0 - mov DWORD [32+edi],ebp - mov eax,DWORD [28+esi] - ; saved r[8] - ; ############### Calculate word 9 - xor ebp,ebp - ; sqr a[7]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [24+esi] - adc ebp,0 - mov edx,DWORD [12+esi] - ; sqr a[6]*a[3] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [20+esi] - adc ebp,0 - mov edx,DWORD [16+esi] - ; sqr a[5]*a[4] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov 
eax,DWORD [28+esi] - adc ebp,0 - mov DWORD [36+edi],ebx - mov edx,DWORD [12+esi] - ; saved r[9] - ; ############### Calculate word 10 - xor ebx,ebx - ; sqr a[7]*a[3] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [24+esi] - adc ebx,0 - mov edx,DWORD [16+esi] - ; sqr a[6]*a[4] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [20+esi] - adc ebx,0 - ; sqr a[5]*a[5] - mul eax - add ecx,eax - adc ebp,edx - mov edx,DWORD [16+esi] - adc ebx,0 - mov DWORD [40+edi],ecx - mov eax,DWORD [28+esi] - ; saved r[10] - ; ############### Calculate word 11 - xor ecx,ecx - ; sqr a[7]*a[4] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [24+esi] - adc ecx,0 - mov edx,DWORD [20+esi] - ; sqr a[6]*a[5] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [28+esi] - adc ecx,0 - mov DWORD [44+edi],ebp - mov edx,DWORD [20+esi] - ; saved r[11] - ; ############### Calculate word 12 - xor ebp,ebp - ; sqr a[7]*a[5] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [24+esi] - adc ebp,0 - ; sqr a[6]*a[6] - mul eax - add ebx,eax - adc ecx,edx - mov edx,DWORD [24+esi] - adc ebp,0 - mov DWORD [48+edi],ebx - mov eax,DWORD [28+esi] - ; saved r[12] - ; ############### Calculate word 13 - xor ebx,ebx - ; sqr a[7]*a[6] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [28+esi] - adc ebx,0 - mov DWORD [52+edi],ecx - ; saved r[13] - ; ############### Calculate word 14 - xor ecx,ecx - ; sqr a[7]*a[7] - mul eax - add ebp,eax - adc ebx,edx - adc ecx,0 - mov DWORD [56+edi],ebp - ; saved r[14] - mov DWORD [60+edi],ebx - pop ebx - pop ebp - pop edi - pop esi - ret -global _bn_sqr_comba4 -align 16 -_bn_sqr_comba4: -L$_bn_sqr_comba4_begin: - push esi - push edi - push ebp - push ebx - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - xor ebx,ebx - xor ecx,ecx - mov eax,DWORD [esi] - ; ############### Calculate word 0 - xor ebp,ebp - ; sqr a[0]*a[0] - mul eax - add ebx,eax - adc ecx,edx - mov edx,DWORD [esi] - adc ebp,0 - mov DWORD [edi],ebx - mov eax,DWORD [4+esi] - ; saved r[0] - ; ############### Calculate word 1 - xor ebx,ebx - ; sqr a[1]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [8+esi] - adc ebx,0 - mov DWORD [4+edi],ecx - mov edx,DWORD [esi] - ; saved r[1] - ; ############### Calculate word 2 - xor ecx,ecx - ; sqr a[2]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [4+esi] - adc ecx,0 - ; sqr a[1]*a[1] - mul eax - add ebp,eax - adc ebx,edx - mov edx,DWORD [esi] - adc ecx,0 - mov DWORD [8+edi],ebp - mov eax,DWORD [12+esi] - ; saved r[2] - ; ############### Calculate word 3 - xor ebp,ebp - ; sqr a[3]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [8+esi] - adc ebp,0 - mov edx,DWORD [4+esi] - ; sqr a[2]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [12+esi] - adc ebp,0 - mov DWORD [12+edi],ebx - mov edx,DWORD [4+esi] - ; saved r[3] - ; ############### Calculate word 4 - xor ebx,ebx - ; sqr a[3]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [8+esi] - adc ebx,0 - ; sqr a[2]*a[2] - mul eax - add ecx,eax - adc ebp,edx - mov edx,DWORD [8+esi] - adc ebx,0 - mov DWORD [16+edi],ecx - mov eax,DWORD [12+esi] - ; saved r[4] 
- ; ############### Calculate word 5 - xor ecx,ecx - ; sqr a[3]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [12+esi] - adc ecx,0 - mov DWORD [20+edi],ebp - ; saved r[5] - ; ############### Calculate word 6 - xor ebp,ebp - ; sqr a[3]*a[3] - mul eax - add ebx,eax - adc ecx,edx - adc ebp,0 - mov DWORD [24+edi],ebx - ; saved r[6] - mov DWORD [28+edi],ecx - pop ebx - pop ebp - pop edi - pop esi - ret -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/check.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/check.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/check.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/check.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/dh.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/dh.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/dh.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/dh.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digest.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digest.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digest.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digest.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c.inc similarity index 75% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c.inc index e685ac58..817f454d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c.inc @@ -59,8 +59,6 @@ #include #include -#include -#include #include #include @@ -75,69 +73,21 @@ #endif -static void md4_init(EVP_MD_CTX *ctx) { - CHECK(MD4_Init(ctx->md_data)); -} - -static void md4_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(MD4_Update(ctx->md_data, data, count)); -} - -static void md4_final(EVP_MD_CTX *ctx, uint8_t *out) { - CHECK(MD4_Final(out, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md4) { - out->type = NID_md4; - out->md_size = MD4_DIGEST_LENGTH; - out->flags = 0; - out->init = md4_init; - out->update = md4_update; - out->final = md4_final; - out->block_size = 64; - out->ctx_size = sizeof(MD4_CTX); -} - - -static void md5_init(EVP_MD_CTX *ctx) { - CHECK(MD5_Init(ctx->md_data)); -} - -static void md5_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(MD5_Update(ctx->md_data, data, count)); -} - -static void md5_final(EVP_MD_CTX *ctx, uint8_t *out) { - CHECK(MD5_Final(out, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md5) { - out->type = NID_md5; - out->md_size = MD5_DIGEST_LENGTH; - out->flags = 0; - out->init = md5_init; - out->update = md5_update; - out->final = md5_final; - out->block_size = 64; - out->ctx_size = sizeof(MD5_CTX); -} - - static void sha1_init(EVP_MD_CTX *ctx) { - CHECK(SHA1_Init(ctx->md_data)); + BCM_sha1_init(ctx->md_data); } static void sha1_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(SHA1_Update(ctx->md_data, data, count)); + BCM_sha1_update(ctx->md_data, data, count); } static void 
sha1_final(EVP_MD_CTX *ctx, uint8_t *md) { - CHECK(SHA1_Final(md, ctx->md_data)); + BCM_sha1_final(md, ctx->md_data); } DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha1) { out->type = NID_sha1; - out->md_size = SHA_DIGEST_LENGTH; + out->md_size = BCM_SHA_DIGEST_LENGTH; out->flags = 0; out->init = sha1_init; out->update = sha1_update; @@ -266,39 +216,4 @@ DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha512_256) { out->ctx_size = sizeof(SHA512_CTX); } - -typedef struct { - MD5_CTX md5; - SHA_CTX sha1; -} MD5_SHA1_CTX; - -static void md5_sha1_init(EVP_MD_CTX *md_ctx) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Init(&ctx->md5) && SHA1_Init(&ctx->sha1)); -} - -static void md5_sha1_update(EVP_MD_CTX *md_ctx, const void *data, - size_t count) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Update(&ctx->md5, data, count) && - SHA1_Update(&ctx->sha1, data, count)); -} - -static void md5_sha1_final(EVP_MD_CTX *md_ctx, uint8_t *out) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Final(out, &ctx->md5) && - SHA1_Final(out + MD5_DIGEST_LENGTH, &ctx->sha1)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md5_sha1) { - out->type = NID_md5_sha1; - out->md_size = MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH; - out->flags = 0; - out->init = md5_sha1_init; - out->update = md5_sha1_update; - out->final = md5_sha1_final; - out->block_size = 64; - out->ctx_size = sizeof(MD5_SHA1_CTX); -} - #undef CHECK diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/digestsign/digestsign.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/digestsign/digestsign.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/digestsign/digestsign.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/digestsign/digestsign.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c.inc similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c.inc index 57306177..249588fb 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c.inc @@ -75,10 +75,12 @@ #include #include #include +#include #include #include "internal.h" #include "../delocate.h" +#include "../ecdsa/internal.h" #include "../service_indicator/internal.h" #include "../../internal.h" @@ -242,7 +244,10 @@ int EC_KEY_set_private_key(EC_KEY *key, const BIGNUM *priv_key) { return 0; } if (!ec_bignum_to_scalar(key->group, &scalar->scalar, priv_key) || - ec_scalar_is_zero(key->group, &scalar->scalar)) { + // Zero is not a valid private key, so it is safe to leak the result of + // this comparison. 
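/*
 * Illustrative sketch, not part of the patch: the EC_KEY_set_private_key hunk
 * above wraps the zero-private-key test in |constant_time_declassify_int|.
 * Under constant-time validation builds, data derived from secrets is treated
 * as tainted and branching on it is flagged; the wrapper marks this one bit as
 * public, which is safe because a rejected key is discarded and never used
 * (the same reasoning as the CONSTTIME_DECLASSIFY calls elsewhere in this
 * patch). The |sketch_*| names below are hypothetical stand-ins that only show
 * the shape of the pattern.
 */
#include <stddef.h>

static int sketch_declassify_int(int v) {
  /* Functionally the identity; a real implementation would additionally
   * unpoison |v| for the constant-time checker (e.g. via Valgrind client
   * requests). */
  return v;
}

static int sketch_private_key_is_nonzero(const unsigned char *key, size_t len) {
  /* Accumulate all bytes without branching on secret data, then make the
   * single yes/no answer public. */
  unsigned char acc = 0;
  for (size_t i = 0; i < len; i++) {
    acc |= key[i];
  }
  return sketch_declassify_int(acc != 0);
}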
+ constant_time_declassify_int( + ec_scalar_is_zero(key->group, &scalar->scalar))) { OPENSSL_PUT_ERROR(EC, EC_R_INVALID_PRIVATE_KEY); ec_wrapped_scalar_free(scalar); return 0; @@ -341,15 +346,17 @@ int EC_KEY_check_fips(const EC_KEY *key) { } if (key->priv_key) { - uint8_t data[16] = {0}; - ECDSA_SIG *sig = ECDSA_do_sign(data, sizeof(data), key); + uint8_t digest[SHA256_DIGEST_LENGTH] = {0}; + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed(digest, sizeof(digest), sig, &sig_len, sizeof(sig), + key)) { + goto end; + } if (boringssl_fips_break_test("ECDSA_PWCT")) { - data[0] = ~data[0]; + digest[0] = ~digest[0]; } - int ok = sig != NULL && - ECDSA_do_verify(data, sizeof(data), sig, key); - ECDSA_SIG_free(sig); - if (!ok) { + if (!ecdsa_verify_fixed(digest, sizeof(digest), sig, sig_len, key)) { OPENSSL_PUT_ERROR(EC, EC_R_PUBLIC_KEY_VALIDATION_FAILED); goto end; } @@ -518,6 +525,11 @@ int EC_KEY_generate_key(EC_KEY *key) { } int EC_KEY_generate_key_fips(EC_KEY *eckey) { + if (eckey == NULL || eckey->group == NULL) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + boringssl_ensure_ecc_self_test(); if (EC_KEY_generate_key(eckey) && EC_KEY_check_fips(eckey)) { diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_montgomery.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_montgomery.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_montgomery.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_montgomery.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/felem.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/felem.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/felem.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/felem.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/oct.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/oct.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/oct.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/oct.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p224-64.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p224-64.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p224-64.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p224-64.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c.inc similarity index 85% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c.inc index 7d63708a..00d65264 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c.inc @@ -39,7 +39,7 @@ typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; // One converted into the Montgomery domain -static const BN_ULONG ONE[P256_LIMBS] = { +static const BN_ULONG ONE_MONT[P256_LIMBS] = { TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe), }; @@ -116,6 +116,103 @@ static BN_ULONG is_not_zero(BN_ULONG in) { return in; } +#if defined(OPENSSL_X86_64) +// Dispatch between CPU variations. The "_adx" suffixed functions use MULX in +// addition to ADCX/ADOX. MULX is part of BMI2, not ADX, so we must check both +// capabilities. 
+static void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_mul_mont_nohw(res, a, b); + } +} + +static void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_sqr_mont_adx(res, a); + } else { + ecp_nistz256_sqr_mont_nohw(res, a); + } +} + +static void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_ord_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_ord_mul_mont_nohw(res, a, b); + } +} + +static void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + BN_ULONG rep) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_ord_sqr_mont_adx(res, a, rep); + } else { + ecp_nistz256_ord_sqr_mont_nohw(res, a, rep); + } +} + +static void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], + int index) { + if (CRYPTO_is_AVX2_capable()) { + ecp_nistz256_select_w5_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w5_nohw(val, in_t, index); + } +} + +static void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], + int index) { + if (CRYPTO_is_AVX2_capable()) { + ecp_nistz256_select_w7_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w7_nohw(val, in_t, index); + } +} + +static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_double_adx(r, a); + } else { + ecp_nistz256_point_double_nohw(r, a); + } +} + +static void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_add_adx(r, a, b); + } else { + ecp_nistz256_point_add_nohw(r, a, b); + } +} + +static void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_add_affine_adx(r, a, b); + } else { + ecp_nistz256_point_add_affine_nohw(r, a, b); + } +} +#endif // OPENSSL_X86_64 + +// ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain +// by multiplying with 1. +static void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG in[P256_LIMBS]) { + static const BN_ULONG ONE[P256_LIMBS] = {1}; + ecp_nistz256_mul_mont(res, in, ONE); +} + // ecp_nistz256_mod_inverse_sqr_mont sets |r| to (|in| * 2^-256)^-2 * 2^256 mod // p. That is, |r| is the modular inverse square of |in| for input and output in // the Montgomery domain. @@ -328,12 +425,12 @@ static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_JACOBIAN *r, copy_conditional(t.Y, p.Z, wvalue & 1); // Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t| - // is infinity and |ONE| otherwise. |t| was computed from the table, so it - // is infinity iff |wvalue >> 1| is zero. + // is infinity and |ONE_MONT| otherwise. |t| was computed from the table, so + // it is infinity iff |wvalue >> 1| is zero. 
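/*
 * Illustrative sketch, not part of the patch: the surrounding hunk selects
 * between Z = 0 (point at infinity) and ONE_MONT using the
 * |copy_conditional|/|is_not_zero| idiom, i.e. a mask-based select with a
 * data-independent memory access pattern. The mask below is all-ones when
 * |flag| is non-zero and all-zeros otherwise, so |dst| either keeps its
 * contents or receives |src| without a secret-dependent branch. The |sketch_*|
 * names are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

static uint64_t sketch_nonzero_mask(uint64_t flag) {
  /* (flag | -flag) has its top bit set iff flag != 0; subtracting that bit
   * from zero spreads it to every bit of the mask. */
  return (uint64_t)0 - ((flag | (0 - flag)) >> 63);
}

static void sketch_copy_conditional(uint64_t *dst, const uint64_t *src,
                                    size_t num_words, uint64_t flag) {
  uint64_t mask = sketch_nonzero_mask(flag);
  for (size_t i = 0; i < num_words; i++) {
    dst[i] = (src[i] & mask) | (dst[i] & ~mask);
  }
}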
OPENSSL_memcpy(p.X, t.X, sizeof(p.X)); OPENSSL_memcpy(p.Y, t.Y, sizeof(p.Y)); OPENSSL_memset(p.Z, 0, sizeof(p.Z)); - copy_conditional(p.Z, ONE, is_not_zero(wvalue >> 1)); + copy_conditional(p.Z, ONE_MONT, is_not_zero(wvalue >> 1)); for (int i = 1; i < 37; i++) { wvalue = calc_wvalue(&index, p_str); @@ -372,14 +469,14 @@ static void ecp_nistz256_points_mul_public(const EC_GROUP *group, size_t wvalue = calc_first_wvalue(&index, p_str); // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p| - // is infinity and |ONE| otherwise. |p| was computed from the table, so it - // is infinity iff |wvalue >> 1| is zero. + // is infinity and |ONE_MONT| otherwise. |p| was computed from the table, so + // it is infinity iff |wvalue >> 1| is zero. if ((wvalue >> 1) != 0) { OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X, sizeof(p.X)); OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y, sizeof(p.Y)); - OPENSSL_memcpy(p.Z, ONE, sizeof(p.Z)); + OPENSSL_memcpy(p.Z, ONE_MONT, sizeof(p.Z)); } else { OPENSSL_memset(p.X, 0, sizeof(p.X)); OPENSSL_memset(p.Y, 0, sizeof(p.Y)); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.h index 985f8e25..cbf1e056 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.h @@ -48,21 +48,29 @@ extern "C" { void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); // ecp_nistz256_mul_mont sets |res| to |a| * |b| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); +#endif // ecp_nistz256_sqr_mont sets |res| to |a| * |a| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +void ecp_nistz256_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +#else void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); - -// ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain -// by multiplying with 1. -static inline void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], - const BN_ULONG in[P256_LIMBS]) { - static const BN_ULONG ONE[P256_LIMBS] = { 1 }; - ecp_nistz256_mul_mont(res, in, ONE); -} +#endif // P-256 scalar operations. @@ -72,15 +80,31 @@ static inline void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], // ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs // are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_ord_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); +#endif // ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2*|rep|) where inputs and // outputs are in Montgomery form. That is, |res| is // (|a| * 2^-256)^(2*|rep|) * 2^256 mod N. 
+#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +void ecp_nistz256_ord_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#else void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#endif // beeu_mod_inverse_vartime sets out = a^-1 mod p using a Euclidean algorithm. // Assumption: 0 < a < p < 2^(256) and p is odd. @@ -111,27 +135,60 @@ typedef struct { // ecp_nistz256_select_w5 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 16 // and all zeros (the point at infinity) if |index| is 0. This is done in // constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w5_nohw(P256_POINT *val, const P256_POINT in_t[16], + int index); +void ecp_nistz256_select_w5_avx2(P256_POINT *val, const P256_POINT in_t[16], + int index); +#else void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], int index); +#endif // ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64 // and all zeros (the point at infinity) if |index| is 0. This is done in // constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w7_nohw(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +void ecp_nistz256_select_w7_avx2(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +#else void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, const P256_POINT_AFFINE in_t[64], int index); +#endif // ecp_nistz256_point_double sets |r| to |a| doubled. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_double_nohw(P256_POINT *r, const P256_POINT *a); +void ecp_nistz256_point_double_adx(P256_POINT *r, const P256_POINT *a); +#else void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); +#endif // ecp_nistz256_point_add adds |a| to |b| and places the result in |r|. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +void ecp_nistz256_point_add_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +#else void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b); +#endif // ecp_nistz256_point_add_affine adds |a| to |b| and places the result in // |r|. |a| and |b| must not represent the same point unless they are both // infinity. 
+#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_affine_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +void ecp_nistz256_point_add_affine_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +#else void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, const P256_POINT_AFFINE *b); +#endif #endif /* !defined(OPENSSL_NO_ASM) && \ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c.inc similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c.inc index 0850c077..7721b488 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c.inc @@ -23,8 +23,12 @@ int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out, const BIGNUM *in) { + // Scalars, which are often secret, must be reduced modulo the order. Those + // that are not will be discarded, so leaking the result of the comparison is + // safe. if (!bn_copy_words(out->words, group->order.N.width, in) || - !bn_less_than_words(out->words, group->order.N.d, group->order.N.width)) { + !constant_time_declassify_int(bn_less_than_words( + out->words, group->order.N.d, group->order.N.width))) { OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR); return 0; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple_mul.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple_mul.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple_mul.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple_mul.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/util.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/util.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/util.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/util.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/wnaf.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/wnaf.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/wnaf.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/wnaf.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdh/ecdh.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdh/ecdh.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdh/ecdh.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdh/ecdh.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c.inc similarity index 74% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c.inc index 053235bc..5065f2a8 100644 --- 
a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c.inc @@ -95,61 +95,9 @@ static void digest_to_scalar(const EC_GROUP *group, EC_SCALAR *out, order->width); } -ECDSA_SIG *ECDSA_SIG_new(void) { - ECDSA_SIG *sig = OPENSSL_malloc(sizeof(ECDSA_SIG)); - if (sig == NULL) { - return NULL; - } - sig->r = BN_new(); - sig->s = BN_new(); - if (sig->r == NULL || sig->s == NULL) { - ECDSA_SIG_free(sig); - return NULL; - } - return sig; -} - -void ECDSA_SIG_free(ECDSA_SIG *sig) { - if (sig == NULL) { - return; - } - - BN_free(sig->r); - BN_free(sig->s); - OPENSSL_free(sig); -} - -const BIGNUM *ECDSA_SIG_get0_r(const ECDSA_SIG *sig) { - return sig->r; -} - -const BIGNUM *ECDSA_SIG_get0_s(const ECDSA_SIG *sig) { - return sig->s; -} - -void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **out_r, - const BIGNUM **out_s) { - if (out_r != NULL) { - *out_r = sig->r; - } - if (out_s != NULL) { - *out_s = sig->s; - } -} - -int ECDSA_SIG_set0(ECDSA_SIG *sig, BIGNUM *r, BIGNUM *s) { - if (r == NULL || s == NULL) { - return 0; - } - BN_free(sig->r); - BN_free(sig->s); - sig->r = r; - sig->s = s; - return 1; -} - -int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey) { +int ecdsa_verify_fixed_no_self_test(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, + const EC_KEY *eckey) { const EC_GROUP *group = EC_KEY_get0_group(eckey); const EC_POINT *pub_key = EC_KEY_get0_public_key(eckey); if (group == NULL || pub_key == NULL || sig == NULL) { @@ -157,11 +105,13 @@ int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, return 0; } + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); EC_SCALAR r, s, u1, u2, s_inv_mont, m; - if (BN_is_zero(sig->r) || - !ec_bignum_to_scalar(group, &r, sig->r) || - BN_is_zero(sig->s) || - !ec_bignum_to_scalar(group, &s, sig->s)) { + if (sig_len != 2 * scalar_len || + !ec_scalar_from_bytes(group, &r, sig, scalar_len) || + ec_scalar_is_zero(group, &r) || + !ec_scalar_from_bytes(group, &s, sig + scalar_len, scalar_len) || + ec_scalar_is_zero(group, &s)) { OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); return 0; } @@ -195,24 +145,31 @@ int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, return 1; } -int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey) { +int ecdsa_verify_fixed(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, const EC_KEY *key) { boringssl_ensure_ecc_self_test(); - return ecdsa_do_verify_no_self_test(digest, digest_len, sig, eckey); + return ecdsa_verify_fixed_no_self_test(digest, digest_len, sig, sig_len, key); } -static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, - const EC_SCALAR *priv_key, const EC_SCALAR *k, - const uint8_t *digest, size_t digest_len) { +static int ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_SCALAR *priv_key, const EC_SCALAR *k, + const uint8_t *digest, size_t digest_len) { *out_retry = 0; // Check that the size of the group order is FIPS compliant (FIPS 186-4 // B.5.2). 
const BIGNUM *order = EC_GROUP_get0_order(group); if (BN_num_bits(order) < 160) { - OPENSSL_PUT_ERROR(ECDSA, EC_R_INVALID_GROUP_ORDER); - return NULL; + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); + return 0; + } + + size_t sig_len = 2 * BN_num_bytes(order); + if (sig_len > max_sig_len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; } // Compute r, the x-coordinate of k * generator. @@ -220,12 +177,12 @@ static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, EC_SCALAR r; if (!ec_point_mul_scalar_base(group, &tmp_point, k) || !ec_get_x_coordinate_as_scalar(group, &r, &tmp_point)) { - return NULL; + return 0; } if (constant_time_declassify_int(ec_scalar_is_zero(group, &r))) { *out_retry = 1; - return NULL; + return 0; } // s = priv_key * r. Note if only one parameter is in the Montgomery domain, @@ -252,71 +209,59 @@ static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, ec_scalar_mul_montgomery(group, &s, &s, &tmp); if (constant_time_declassify_int(ec_scalar_is_zero(group, &s))) { *out_retry = 1; - return NULL; + return 0; } CONSTTIME_DECLASSIFY(r.words, sizeof(r.words)); CONSTTIME_DECLASSIFY(s.words, sizeof(r.words)); - ECDSA_SIG *ret = ECDSA_SIG_new(); - if (ret == NULL || // - !bn_set_words(ret->r, r.words, order->width) || - !bn_set_words(ret->s, s.words, order->width)) { - ECDSA_SIG_free(ret); - return NULL; - } - return ret; + size_t len; + ec_scalar_to_bytes(group, sig, &len, &r); + assert(len == sig_len / 2); + ec_scalar_to_bytes(group, sig + len, &len, &s); + assert(len == sig_len / 2); + *out_sig_len = sig_len; + return 1; } -ECDSA_SIG *ecdsa_sign_with_nonce_for_known_answer_test(const uint8_t *digest, - size_t digest_len, - const EC_KEY *eckey, - const uint8_t *nonce, - size_t nonce_len) { +int ecdsa_sign_fixed_with_nonce_for_known_answer_test( + const uint8_t *digest, size_t digest_len, uint8_t *sig, size_t *out_sig_len, + size_t max_sig_len, const EC_KEY *eckey, const uint8_t *nonce, + size_t nonce_len) { if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED); - return NULL; + return 0; } const EC_GROUP *group = EC_KEY_get0_group(eckey); if (group == NULL || eckey->priv_key == NULL) { OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); - return NULL; + return 0; } const EC_SCALAR *priv_key = &eckey->priv_key->scalar; EC_SCALAR k; if (!ec_scalar_from_bytes(group, &k, nonce, nonce_len)) { - return NULL; + return 0; } int retry_ignored; - return ecdsa_sign_impl(group, &retry_ignored, priv_key, &k, digest, - digest_len); -} - -// This function is only exported for testing and is not called in production -// code. 
-ECDSA_SIG *ECDSA_sign_with_nonce_and_leak_private_key_for_testing( - const uint8_t *digest, size_t digest_len, const EC_KEY *eckey, - const uint8_t *nonce, size_t nonce_len) { - boringssl_ensure_ecc_self_test(); - - return ecdsa_sign_with_nonce_for_known_answer_test(digest, digest_len, eckey, - nonce, nonce_len); + return ecdsa_sign_impl(group, &retry_ignored, sig, out_sig_len, max_sig_len, + priv_key, &k, digest, digest_len); } -ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, - const EC_KEY *eckey) { +int ecdsa_sign_fixed(const uint8_t *digest, size_t digest_len, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_KEY *eckey) { boringssl_ensure_ecc_self_test(); if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED); - return NULL; + return 0; } const EC_GROUP *group = EC_KEY_get0_group(eckey); if (group == NULL || eckey->priv_key == NULL) { OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); - return NULL; + return 0; } const BIGNUM *order = EC_GROUP_get0_order(group); const EC_SCALAR *priv_key = &eckey->priv_key->scalar; @@ -340,12 +285,11 @@ ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, // FIPS) because the probability of requiring even one retry is negligible, // let alone 32. static const int kMaxIterations = 32; - ECDSA_SIG *ret = NULL; + int ret = 0; int iters = 0; for (;;) { EC_SCALAR k; if (!ec_random_nonzero_scalar(group, &k, additional_data)) { - ret = NULL; goto out; } @@ -354,8 +298,9 @@ ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, CONSTTIME_SECRET(k.words, sizeof(k.words)); int retry; - ret = ecdsa_sign_impl(group, &retry, priv_key, &k, digest, digest_len); - if (ret != NULL || !retry) { + ret = ecdsa_sign_impl(group, &retry, sig, out_sig_len, max_sig_len, + priv_key, &k, digest, digest_len); + if (ret || !retry) { goto out; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/internal.h index 0a8def26..519f6e18 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/internal.h @@ -17,25 +17,42 @@ #include +#include "../ec/internal.h" + #if defined(__cplusplus) extern "C" { #endif -// ecdsa_sign_with_nonce_for_known_answer_test behaves like |ECDSA_do_sign| but -// takes a fixed nonce. This function is used as part of known-answer tests in -// the FIPS module. -ECDSA_SIG *ecdsa_sign_with_nonce_for_known_answer_test(const uint8_t *digest, - size_t digest_len, - const EC_KEY *eckey, - const uint8_t *nonce, - size_t nonce_len); +// ECDSA_MAX_FIXED_LEN is the maximum length of an ECDSA signature in the +// fixed-width, big-endian format from IEEE P1363. +#define ECDSA_MAX_FIXED_LEN (2 * EC_MAX_BYTES) + +// ecdsa_sign_fixed behaves like |ECDSA_sign| but uses the fixed-width, +// big-endian format from IEEE P1363. +int ecdsa_sign_fixed(const uint8_t *digest, size_t digest_len, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_KEY *key); + +// ecdsa_sign_fixed_with_nonce_for_known_answer_test behaves like +// |ecdsa_sign_fixed| but takes a caller-supplied nonce. This function is used +// as part of known-answer tests in the FIPS module. 
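/*
 * Illustrative sketch, not part of the patch: the declarations above describe
 * the "fixed-width, big-endian format from IEEE P1363" used by
 * |ecdsa_sign_fixed|/|ecdsa_verify_fixed|. A signature is simply r || s, each
 * scalar left-padded with zeros to the byte length of the group order, so the
 * total length is always exactly twice that length (bounded by
 * ECDSA_MAX_FIXED_LEN). The encoder below shows that layout for word-based
 * scalars; the |sketch_*| names are hypothetical stand-ins for the EC_SCALAR
 * helpers used in ecdsa.c.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Serialize a little-endian array of 64-bit words as a big-endian,
 * fixed-width byte string of |scalar_len| bytes. */
static void sketch_scalar_to_be_bytes(uint8_t *out, size_t scalar_len,
                                      const uint64_t *words, size_t num_words) {
  memset(out, 0, scalar_len);
  for (size_t i = 0; i < scalar_len && i / 8 < num_words; i++) {
    out[scalar_len - 1 - i] = (uint8_t)(words[i / 8] >> (8 * (i % 8)));
  }
}

/* Lay out a P1363-style signature: big-endian r, then big-endian s, each
 * occupying exactly |scalar_len| bytes. Returns the total length written. */
static size_t sketch_p1363_encode(uint8_t *sig, size_t scalar_len,
                                  const uint64_t *r, const uint64_t *s,
                                  size_t num_words) {
  sketch_scalar_to_be_bytes(sig, scalar_len, r, num_words);
  sketch_scalar_to_be_bytes(sig + scalar_len, scalar_len, s, num_words);
  return 2 * scalar_len;
}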
+int ecdsa_sign_fixed_with_nonce_for_known_answer_test( + const uint8_t *digest, size_t digest_len, uint8_t *sig, size_t *out_sig_len, + size_t max_sig_len, const EC_KEY *key, const uint8_t *nonce, + size_t nonce_len); + +// ecdsa_verify_fixed behaves like |ECDSA_verify| but uses the fixed-width, +// big-endian format from IEEE P1363. +int ecdsa_verify_fixed(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, const EC_KEY *key); -// ecdsa_do_verify_no_self_test does the same as |ECDSA_do_verify|, but doesn't +// ecdsa_verify_fixed_no_self_test behaves like ecdsa_verify_fixed, but doesn't // try to run the self-test first. This is for use in the self tests themselves, // to prevent an infinite loop. -int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey); +int ecdsa_verify_fixed_no_self_test(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, + const EC_KEY *key); #if defined(__cplusplus) diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/fips_shared_support.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/fips_shared_support.c index 2a66a1f0..74b35f01 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/fips_shared_support.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/fips_shared_support.c @@ -20,13 +20,10 @@ // that must be replaced with the real value during the build process. This // value need only be distinct, i.e. so that we can safely search-and-replace it // in an object file. -const uint8_t BORINGSSL_bcm_text_hash[64]; -const uint8_t BORINGSSL_bcm_text_hash[64] = { +const uint8_t BORINGSSL_bcm_text_hash[32]; +const uint8_t BORINGSSL_bcm_text_hash[32] = { 0xae, 0x2c, 0xea, 0x2a, 0xbd, 0xa6, 0xf3, 0xec, 0x97, 0x7f, 0x9b, 0xf6, 0x94, 0x9a, 0xfc, 0x83, 0x68, 0x27, 0xcb, 0xa0, 0xa0, 0x9f, - 0x6b, 0x6f, 0xde, 0x52, 0xcd, 0xe2, 0xcd, 0xff, 0x31, 0x80, 0xa2, - 0xd4, 0xc3, 0x66, 0x0f, 0xc2, 0x6a, 0x7b, 0xf4, 0xbe, 0x39, 0xa2, - 0xd7, 0x25, 0xdb, 0x21, 0x98, 0xe9, 0xd5, 0x53, 0xbf, 0x5c, 0x32, - 0x06, 0x83, 0x34, 0x0c, 0x65, 0x89, 0x52, 0xbd, 0x1f, + 0x6b, 0x6f, 0xde, 0x52, 0xcd, 0xe2, 0xcd, 0xff, 0x31, 0x80, }; #endif // FIPS && SHARED_LIBRARY diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-ios.ios.arm.S deleted file mode 100644 index d6ddf144..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-ios.ios.arm.S +++ /dev/null @@ -1,257 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL -@ instructions are in aesv8-armx.pl.) - - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#define ldrplb ldrbpl -#define ldrneb ldrbne -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif -#if __ARM_MAX_ARCH__>=7 - - - -.globl _gcm_init_neon -.private_extern _gcm_init_neon -#ifdef __thumb2__ -.thumb_func _gcm_init_neon -#endif -.align 4 -_gcm_init_neon: - vld1.64 d7,[r1]! 
@ load H - vmov.i8 q8,#0xe1 - vld1.64 d6,[r1] - vshl.i64 d17,#57 - vshr.u64 d16,#63 @ t0=0xc2....01 - vdup.8 q9,d7[7] - vshr.u64 d26,d6,#63 - vshr.s8 q9,#7 @ broadcast carry bit - vshl.i64 q3,q3,#1 - vand q8,q8,q9 - vorr d7,d26 @ H<<<=1 - veor q3,q3,q8 @ twisted H - vstmia r0,{q3} - - bx lr @ bx lr - - -.globl _gcm_gmult_neon -.private_extern _gcm_gmult_neon -#ifdef __thumb2__ -.thumb_func _gcm_gmult_neon -#endif -.align 4 -_gcm_gmult_neon: - vld1.64 d7,[r0]! @ load Xi - vld1.64 d6,[r0]! - vmov.i64 d29,#0x0000ffffffffffff - vldmia r1,{d26,d27} @ load twisted H - vmov.i64 d30,#0x00000000ffffffff -#ifdef __ARMEL__ - vrev64.8 q3,q3 -#endif - vmov.i64 d31,#0x000000000000ffff - veor d28,d26,d27 @ Karatsuba pre-processing - mov r3,#16 - b Lgmult_neon - - -.globl _gcm_ghash_neon -.private_extern _gcm_ghash_neon -#ifdef __thumb2__ -.thumb_func _gcm_ghash_neon -#endif -.align 4 -_gcm_ghash_neon: - vld1.64 d1,[r0]! @ load Xi - vld1.64 d0,[r0]! - vmov.i64 d29,#0x0000ffffffffffff - vldmia r1,{d26,d27} @ load twisted H - vmov.i64 d30,#0x00000000ffffffff -#ifdef __ARMEL__ - vrev64.8 q0,q0 -#endif - vmov.i64 d31,#0x000000000000ffff - veor d28,d26,d27 @ Karatsuba pre-processing - -Loop_neon: - vld1.64 d7,[r2]! @ load inp - vld1.64 d6,[r2]! -#ifdef __ARMEL__ - vrev64.8 q3,q3 -#endif - veor q3,q0 @ inp^=Xi -Lgmult_neon: - vext.8 d16, d26, d26, #1 @ A1 - vmull.p8 q8, d16, d6 @ F = A1*B - vext.8 d0, d6, d6, #1 @ B1 - vmull.p8 q0, d26, d0 @ E = A*B1 - vext.8 d18, d26, d26, #2 @ A2 - vmull.p8 q9, d18, d6 @ H = A2*B - vext.8 d22, d6, d6, #2 @ B2 - vmull.p8 q11, d26, d22 @ G = A*B2 - vext.8 d20, d26, d26, #3 @ A3 - veor q8, q8, q0 @ L = E + F - vmull.p8 q10, d20, d6 @ J = A3*B - vext.8 d0, d6, d6, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q0, d26, d0 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d6, d6, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d26, d22 @ K = A*B4 - veor q10, q10, q0 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q0, d26, d6 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q0, q0, q8 - veor q0, q0, q10 - veor d6,d6,d7 @ Karatsuba pre-processing - vext.8 d16, d28, d28, #1 @ A1 - vmull.p8 q8, d16, d6 @ F = A1*B - vext.8 d2, d6, d6, #1 @ B1 - vmull.p8 q1, d28, d2 @ E = A*B1 - vext.8 d18, d28, d28, #2 @ A2 - vmull.p8 q9, d18, d6 @ H = A2*B - vext.8 d22, d6, d6, #2 @ B2 - vmull.p8 q11, d28, d22 @ G = A*B2 - vext.8 d20, d28, d28, #3 @ A3 - veor q8, q8, q1 @ L = E + F - vmull.p8 q10, d20, d6 @ J = A3*B - vext.8 d2, d6, d6, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q1, d28, d2 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d6, d6, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d28, d22 @ K = A*B4 - veor q10, q10, q1 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q1, d28, d6 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q1, q1, q8 - 
veor q1, q1, q10 - vext.8 d16, d27, d27, #1 @ A1 - vmull.p8 q8, d16, d7 @ F = A1*B - vext.8 d4, d7, d7, #1 @ B1 - vmull.p8 q2, d27, d4 @ E = A*B1 - vext.8 d18, d27, d27, #2 @ A2 - vmull.p8 q9, d18, d7 @ H = A2*B - vext.8 d22, d7, d7, #2 @ B2 - vmull.p8 q11, d27, d22 @ G = A*B2 - vext.8 d20, d27, d27, #3 @ A3 - veor q8, q8, q2 @ L = E + F - vmull.p8 q10, d20, d7 @ J = A3*B - vext.8 d4, d7, d7, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q2, d27, d4 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d7, d7, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d27, d22 @ K = A*B4 - veor q10, q10, q2 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q2, d27, d7 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q2, q2, q8 - veor q2, q2, q10 - veor q1,q1,q0 @ Karatsuba post-processing - veor q1,q1,q2 - veor d1,d1,d2 - veor d4,d4,d3 @ Xh|Xl - 256-bit result - - @ equivalent of reduction_avx from ghash-x86_64.pl - vshl.i64 q9,q0,#57 @ 1st phase - vshl.i64 q10,q0,#62 - veor q10,q10,q9 @ - vshl.i64 q9,q0,#63 - veor q10, q10, q9 @ - veor d1,d1,d20 @ - veor d4,d4,d21 - - vshr.u64 q10,q0,#1 @ 2nd phase - veor q2,q2,q0 - veor q0,q0,q10 @ - vshr.u64 q10,q10,#6 - vshr.u64 q0,q0,#1 @ - veor q0,q0,q2 @ - veor q0,q0,q10 @ - - subs r3,#16 - bne Loop_neon - -#ifdef __ARMEL__ - vrev64.8 q0,q0 -#endif - sub r0,#16 - vst1.64 d1,[r0]! @ write out Xi - vst1.64 d0,[r0] - - bx lr @ bx lr - -#endif -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-windows.windows.x86.S deleted file mode 100644 index 43d7bc66..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-windows.windows.x86.S +++ /dev/null @@ -1,304 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _gcm_gmult_ssse3 -align 16 -_gcm_gmult_ssse3: -L$_gcm_gmult_ssse3_begin: - push ebp - push ebx - push esi - push edi - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - movdqu xmm0,[edi] - call L$000pic_point -L$000pic_point: - pop eax - movdqa xmm7,[(L$reverse_bytes-L$000pic_point)+eax] - movdqa xmm2,[(L$low4_mask-L$000pic_point)+eax] -db 102,15,56,0,199 - movdqa xmm1,xmm2 - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm2 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - mov eax,5 -L$001loop_row_1: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$001loop_row_1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - mov eax,5 -L$002loop_row_2: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$002loop_row_2 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - mov eax,6 -L$003loop_row_3: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$003loop_row_3 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 -db 102,15,56,0,215 - movdqu [edi],xmm2 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pop edi - pop esi - pop ebx - pop ebp - ret -global _gcm_ghash_ssse3 -align 16 -_gcm_ghash_ssse3: -L$_gcm_ghash_ssse3_begin: - push ebp - push ebx - push esi - push edi - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov edx,DWORD [28+esp] - mov ecx,DWORD [32+esp] - movdqu xmm0,[edi] - call L$004pic_point -L$004pic_point: - pop ebx - movdqa xmm7,[(L$reverse_bytes-L$004pic_point)+ebx] - and ecx,-16 -db 102,15,56,0,199 - pxor xmm3,xmm3 -L$005loop_ghash: - movdqa xmm2,[(L$low4_mask-L$004pic_point)+ebx] - movdqu xmm1,[edx] -db 102,15,56,0,207 - pxor xmm0,xmm1 - movdqa xmm1,xmm2 - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm2 - pxor xmm2,xmm2 - mov eax,5 -L$006loop_row_4: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - 
psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$006loop_row_4 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - mov eax,5 -L$007loop_row_5: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$007loop_row_5 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - mov eax,6 -L$008loop_row_6: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$008loop_row_6 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - movdqa xmm0,xmm2 - lea esi,[esi-256] - lea edx,[16+edx] - sub ecx,16 - jnz NEAR L$005loop_ghash -db 102,15,56,0,199 - movdqu [edi],xmm0 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pop edi - pop esi - pop ebx - pop ebp - ret -align 16 -L$reverse_bytes: -db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -align 16 -L$low4_mask: -dd 252645135,252645135,252645135,252645135 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-windows.windows.x86.S deleted file mode 100644 index a47ab266..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-windows.windows.x86.S +++ /dev/null @@ -1,337 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _gcm_init_clmul -align 16 -_gcm_init_clmul: -L$_gcm_init_clmul_begin: - mov edx,DWORD [4+esp] - mov eax,DWORD [8+esp] - call L$000pic -L$000pic: - pop ecx - lea ecx,[(L$bswap-L$000pic)+ecx] - movdqu xmm2,[eax] - pshufd xmm2,xmm2,78 - pshufd xmm4,xmm2,255 - movdqa xmm3,xmm2 - psllq xmm2,1 - pxor xmm5,xmm5 - psrlq xmm3,63 - pcmpgtd xmm5,xmm4 - pslldq xmm3,8 - por xmm2,xmm3 - pand xmm5,[16+ecx] - pxor xmm2,xmm5 - movdqa xmm0,xmm2 - movdqa xmm1,xmm0 - pshufd xmm3,xmm0,78 - pshufd xmm4,xmm2,78 - pxor xmm3,xmm0 - pxor xmm4,xmm2 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 - xorps xmm3,xmm0 - xorps xmm3,xmm1 - movdqa xmm4,xmm3 - psrldq xmm3,8 - pslldq xmm4,8 - pxor xmm1,xmm3 - pxor xmm0,xmm4 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm1,xmm4 - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 - pshufd xmm3,xmm2,78 - pshufd xmm4,xmm0,78 - pxor xmm3,xmm2 - movdqu [edx],xmm2 - pxor xmm4,xmm0 - movdqu [16+edx],xmm0 -db 102,15,58,15,227,8 - movdqu [32+edx],xmm4 - ret -global _gcm_gmult_clmul -align 16 -_gcm_gmult_clmul: -L$_gcm_gmult_clmul_begin: - mov eax,DWORD [4+esp] - mov edx,DWORD [8+esp] - call L$001pic -L$001pic: - pop ecx - lea ecx,[(L$bswap-L$001pic)+ecx] - movdqu xmm0,[eax] - movdqa xmm5,[ecx] - movups xmm2,[edx] -db 102,15,56,0,197 - movups xmm4,[32+edx] - movdqa xmm1,xmm0 - pshufd xmm3,xmm0,78 - pxor xmm3,xmm0 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 - xorps xmm3,xmm0 - xorps xmm3,xmm1 - movdqa xmm4,xmm3 - psrldq xmm3,8 - pslldq xmm4,8 - pxor xmm1,xmm3 - pxor xmm0,xmm4 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm1,xmm4 - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 -db 102,15,56,0,197 - movdqu [eax],xmm0 - ret -global _gcm_ghash_clmul -align 16 -_gcm_ghash_clmul: -L$_gcm_ghash_clmul_begin: - push ebp - push ebx - push esi - push edi - mov eax,DWORD [20+esp] - mov edx,DWORD [24+esp] - mov esi,DWORD [28+esp] - mov ebx,DWORD [32+esp] - call L$002pic -L$002pic: - pop ecx - lea ecx,[(L$bswap-L$002pic)+ecx] - movdqu xmm0,[eax] - movdqa xmm5,[ecx] - movdqu xmm2,[edx] -db 102,15,56,0,197 - sub ebx,16 - jz NEAR L$003odd_tail - movdqu xmm3,[esi] - movdqu xmm6,[16+esi] -db 102,15,56,0,221 -db 102,15,56,0,245 - movdqu xmm5,[32+edx] - pxor xmm0,xmm3 - pshufd xmm3,xmm6,78 - movdqa xmm7,xmm6 - pxor xmm3,xmm6 - lea esi,[32+esi] -db 102,15,58,68,242,0 -db 102,15,58,68,250,17 -db 102,15,58,68,221,0 - movups xmm2,[16+edx] - nop - sub ebx,32 - jbe NEAR L$004even_tail - jmp NEAR L$005mod_loop -align 32 -L$005mod_loop: - pshufd xmm4,xmm0,78 - movdqa xmm1,xmm0 - pxor xmm4,xmm0 - nop -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,229,16 - movups xmm2,[edx] - xorps xmm0,xmm6 - movdqa xmm5,[ecx] - xorps xmm1,xmm7 - movdqu xmm7,[esi] - pxor xmm3,xmm0 - movdqu xmm6,[16+esi] - pxor 
xmm3,xmm1 -db 102,15,56,0,253 - pxor xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,8 - pslldq xmm3,8 - pxor xmm1,xmm4 - pxor xmm0,xmm3 -db 102,15,56,0,245 - pxor xmm1,xmm7 - movdqa xmm7,xmm6 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 -db 102,15,58,68,242,0 - movups xmm5,[32+edx] - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - pshufd xmm3,xmm7,78 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm3,xmm7 - pxor xmm1,xmm4 -db 102,15,58,68,250,17 - movups xmm2,[16+edx] - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 -db 102,15,58,68,221,0 - lea esi,[32+esi] - sub ebx,32 - ja NEAR L$005mod_loop -L$004even_tail: - pshufd xmm4,xmm0,78 - movdqa xmm1,xmm0 - pxor xmm4,xmm0 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,229,16 - movdqa xmm5,[ecx] - xorps xmm0,xmm6 - xorps xmm1,xmm7 - pxor xmm3,xmm0 - pxor xmm3,xmm1 - pxor xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,8 - pslldq xmm3,8 - pxor xmm1,xmm4 - pxor xmm0,xmm3 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm1,xmm4 - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 - test ebx,ebx - jnz NEAR L$006done - movups xmm2,[edx] -L$003odd_tail: - movdqu xmm3,[esi] -db 102,15,56,0,221 - pxor xmm0,xmm3 - movdqa xmm1,xmm0 - pshufd xmm3,xmm0,78 - pshufd xmm4,xmm2,78 - pxor xmm3,xmm0 - pxor xmm4,xmm2 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 - xorps xmm3,xmm0 - xorps xmm3,xmm1 - movdqa xmm4,xmm3 - psrldq xmm3,8 - pslldq xmm4,8 - pxor xmm1,xmm3 - pxor xmm0,xmm4 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm1,xmm4 - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 -L$006done: -db 102,15,56,0,197 - movdqu [eax],xmm0 - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$bswap: -db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -db 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 -db 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 -db 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 -db 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 -db 0 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-ios.ios.arm.S deleted file mode 100644 index 2c9d309e..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-ios.ios.arm.S +++ /dev/null @@ -1,259 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -#if __ARM_MAX_ARCH__>=7 -.text - -.code 32 -#undef __thumb2__ -.globl _gcm_init_v8 -.private_extern _gcm_init_v8 -#ifdef __thumb2__ -.thumb_func _gcm_init_v8 -#endif -.align 4 -_gcm_init_v8: - AARCH64_VALID_CALL_TARGET - vld1.64 {q9},[r1] @ load input H - vmov.i8 q11,#0xe1 - vshl.i64 q11,q11,#57 @ 0xc2.0 - vext.8 q3,q9,q9,#8 - vshr.u64 q10,q11,#63 - vdup.32 q9,d18[1] - vext.8 q8,q10,q11,#8 @ t0=0xc2....01 - vshr.u64 q10,q3,#63 - vshr.s32 q9,q9,#31 @ broadcast carry bit - vand q10,q10,q8 - vshl.i64 q3,q3,#1 - vext.8 q10,q10,q10,#8 - vand q8,q8,q9 - vorr q3,q3,q10 @ H<<<=1 - veor q12,q3,q8 @ twisted H - vst1.64 {q12},[r0]! @ store Htable[0] - - @ calculate H^2 - vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing -.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 - veor q8,q8,q12 -.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 -.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q14,q0,q10 - - vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing - veor q9,q9,q14 - vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed - vst1.64 {q13,q14},[r0]! @ store Htable[1..2] - bx lr - -.globl _gcm_gmult_v8 -.private_extern _gcm_gmult_v8 -#ifdef __thumb2__ -.thumb_func _gcm_gmult_v8 -#endif -.align 4 -_gcm_gmult_v8: - AARCH64_VALID_CALL_TARGET - vld1.64 {q9},[r0] @ load Xi - vmov.i8 q11,#0xe1 - vld1.64 {q12,q13},[r1] @ load twisted H, ... - vshl.u64 q11,q11,#57 -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vext.8 q3,q9,q9,#8 - -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo - veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q0,q0,q10 - -#ifndef __ARMEB__ - vrev64.8 q0,q0 -#endif - vext.8 q0,q0,q0,#8 - vst1.64 {q0},[r0] @ write out Xi - - bx lr - -.globl _gcm_ghash_v8 -.private_extern _gcm_ghash_v8 -#ifdef __thumb2__ -.thumb_func _gcm_ghash_v8 -#endif -.align 4 -_gcm_ghash_v8: - AARCH64_VALID_CALL_TARGET - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so - vld1.64 {q0},[r0] @ load [rotated] Xi - @ "[rotated]" means that - @ loaded value would have - @ to be rotated in order to - @ make it appear as in - @ algorithm specification - subs r3,r3,#32 @ see if r3 is 32 or larger - mov r12,#16 @ r12 is used as post- - @ increment for input pointer; - @ as loop is modulo-scheduled - @ r12 is zeroed just in time - @ to preclude overstepping - @ inp[len], which means that - @ last block[s] are actually - @ loaded twice, but last - @ copy is not processed - vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 - vmov.i8 q11,#0xe1 - vld1.64 {q14},[r1] - moveq r12,#0 @ is it time to zero r12? - vext.8 q0,q0,q0,#8 @ rotate Xi - vld1.64 {q8},[r2]! 
@ load [rotated] I[0] - vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant -#ifndef __ARMEB__ - vrev64.8 q8,q8 - vrev64.8 q0,q0 -#endif - vext.8 q3,q8,q8,#8 @ rotate I[0] - blo Lodd_tail_v8 @ r3 was less than 32 - vld1.64 {q9},[r2],r12 @ load [rotated] I[1] -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vext.8 q7,q9,q9,#8 - veor q3,q3,q0 @ I[i]^=Xi -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 - veor q9,q9,q7 @ Karatsuba pre-processing -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 - b Loop_mod2x_v8 - -.align 4 -Loop_mod2x_v8: - vext.8 q10,q3,q3,#8 - subs r3,r3,#32 @ is there more data? -.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo - movlo r12,#0 @ is it time to zero r12? - -.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 - veor q10,q10,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi - veor q0,q0,q4 @ accumulate -.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) - vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] - - veor q2,q2,q6 - moveq r12,#0 @ is it time to zero r12? - veor q1,q1,q5 - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] -#ifndef __ARMEB__ - vrev64.8 q8,q8 -#endif - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - vext.8 q7,q9,q9,#8 - vext.8 q3,q8,q8,#8 - veor q0,q1,q10 -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 - veor q3,q3,q2 @ accumulate q3 early - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q3,q3,q10 - veor q9,q9,q7 @ Karatsuba pre-processing - veor q3,q3,q0 -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 - bhs Loop_mod2x_v8 @ there was at least 32 more bytes - - veor q2,q2,q10 - vext.8 q3,q8,q8,#8 @ re-construct q3 - adds r3,r3,#32 @ re-construct r3 - veor q0,q0,q2 @ re-construct q0 - beq Ldone_v8 @ is r3 zero? 
-Lodd_tail_v8: - vext.8 q10,q0,q0,#8 - veor q3,q3,q0 @ inp^=Xi - veor q9,q8,q10 @ q9 is rotated inp^Xi - -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo - veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q0,q0,q10 - -Ldone_v8: -#ifndef __ARMEB__ - vrev64.8 q0,q0 -#endif - vext.8 q0,q0,q0,#8 - vst1.64 {q0},[r0] @ write out Xi - - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so - bx lr - -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/hkdf/hkdf.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/hkdf/hkdf.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/hkdf/hkdf.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/hkdf/hkdf.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/hmac/hmac.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/hmac/hmac.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/hmac/hmac.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/hmac/hmac.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-windows.windows.x86.S deleted file mode 100644 index e6812534..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-windows.windows.x86.S +++ /dev/null @@ -1,701 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _md5_block_asm_data_order -align 16 -_md5_block_asm_data_order: -L$_md5_block_asm_data_order_begin: - push esi - push edi - mov edi,DWORD [12+esp] - mov esi,DWORD [16+esp] - mov ecx,DWORD [20+esp] - push ebp - shl ecx,6 - push ebx - add ecx,esi - sub ecx,64 - mov eax,DWORD [edi] - push ecx - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] -L$000start: - ; - ; R0 section - mov edi,ecx - mov ebp,DWORD [esi] - ; R0 0 - xor edi,edx - and edi,ebx - lea eax,[3614090360+ebp*1+eax] - xor edi,edx - add eax,edi - mov edi,ebx - rol eax,7 - mov ebp,DWORD [4+esi] - add eax,ebx - ; R0 1 - xor edi,ecx - and edi,eax - lea edx,[3905402710+ebp*1+edx] - xor edi,ecx - add edx,edi - mov edi,eax - rol edx,12 - mov ebp,DWORD [8+esi] - add edx,eax - ; R0 2 - xor edi,ebx - and edi,edx - lea ecx,[606105819+ebp*1+ecx] - xor edi,ebx - add ecx,edi - mov edi,edx - rol ecx,17 - mov ebp,DWORD [12+esi] - add ecx,edx - ; R0 3 - xor edi,eax - and edi,ecx - lea ebx,[3250441966+ebp*1+ebx] - xor edi,eax - add ebx,edi - mov edi,ecx - rol ebx,22 - mov ebp,DWORD [16+esi] - add ebx,ecx - ; R0 4 - xor edi,edx - and edi,ebx - lea eax,[4118548399+ebp*1+eax] - xor edi,edx - add eax,edi - mov edi,ebx - rol eax,7 - mov ebp,DWORD [20+esi] - add eax,ebx - ; R0 5 - xor edi,ecx - and edi,eax - lea edx,[1200080426+ebp*1+edx] - xor edi,ecx - add edx,edi - mov edi,eax - rol edx,12 - mov ebp,DWORD [24+esi] - add edx,eax - ; R0 6 - xor edi,ebx - and edi,edx - lea ecx,[2821735955+ebp*1+ecx] - xor edi,ebx - add ecx,edi - mov edi,edx - rol ecx,17 - mov ebp,DWORD [28+esi] - add ecx,edx - ; R0 7 - xor edi,eax - and edi,ecx - lea ebx,[4249261313+ebp*1+ebx] - xor edi,eax - add ebx,edi - mov edi,ecx - rol ebx,22 - mov ebp,DWORD [32+esi] - add ebx,ecx - ; R0 8 - xor edi,edx - and edi,ebx - lea eax,[1770035416+ebp*1+eax] - xor edi,edx - add eax,edi - mov edi,ebx - rol eax,7 - mov ebp,DWORD [36+esi] - add eax,ebx - ; R0 9 - xor edi,ecx - and edi,eax - lea edx,[2336552879+ebp*1+edx] - xor edi,ecx - add edx,edi - mov edi,eax - rol edx,12 - mov ebp,DWORD [40+esi] - add edx,eax - ; R0 10 - xor edi,ebx - and edi,edx - lea ecx,[4294925233+ebp*1+ecx] - xor edi,ebx - add ecx,edi - mov edi,edx - rol ecx,17 - mov ebp,DWORD [44+esi] - add ecx,edx - ; R0 11 - xor edi,eax - and edi,ecx - lea ebx,[2304563134+ebp*1+ebx] - xor edi,eax - add ebx,edi - mov edi,ecx - rol ebx,22 - mov ebp,DWORD [48+esi] - add ebx,ecx - ; R0 12 - xor edi,edx - and edi,ebx - lea eax,[1804603682+ebp*1+eax] - xor edi,edx - add eax,edi - mov edi,ebx - rol eax,7 - mov ebp,DWORD [52+esi] - add eax,ebx - ; R0 13 - xor edi,ecx - and edi,eax - lea edx,[4254626195+ebp*1+edx] - xor edi,ecx - add edx,edi - mov edi,eax - rol edx,12 - mov ebp,DWORD [56+esi] - add edx,eax - ; R0 14 - xor edi,ebx - and edi,edx - lea ecx,[2792965006+ebp*1+ecx] - xor edi,ebx - add ecx,edi - mov edi,edx - rol ecx,17 - mov ebp,DWORD [60+esi] - add ecx,edx - ; R0 15 - xor edi,eax - and edi,ecx - lea ebx,[1236535329+ebp*1+ebx] - xor edi,eax - add ebx,edi - mov edi,ecx - rol ebx,22 - mov ebp,DWORD [4+esi] - add ebx,ecx - ; - ; R1 section - ; R1 16 - lea eax,[4129170786+ebp*1+eax] - xor edi,ebx - and edi,edx - mov ebp,DWORD [24+esi] - xor edi,ecx - add eax,edi - mov edi,ebx - rol eax,5 - add eax,ebx - ; R1 17 
- lea edx,[3225465664+ebp*1+edx] - xor edi,eax - and edi,ecx - mov ebp,DWORD [44+esi] - xor edi,ebx - add edx,edi - mov edi,eax - rol edx,9 - add edx,eax - ; R1 18 - lea ecx,[643717713+ebp*1+ecx] - xor edi,edx - and edi,ebx - mov ebp,DWORD [esi] - xor edi,eax - add ecx,edi - mov edi,edx - rol ecx,14 - add ecx,edx - ; R1 19 - lea ebx,[3921069994+ebp*1+ebx] - xor edi,ecx - and edi,eax - mov ebp,DWORD [20+esi] - xor edi,edx - add ebx,edi - mov edi,ecx - rol ebx,20 - add ebx,ecx - ; R1 20 - lea eax,[3593408605+ebp*1+eax] - xor edi,ebx - and edi,edx - mov ebp,DWORD [40+esi] - xor edi,ecx - add eax,edi - mov edi,ebx - rol eax,5 - add eax,ebx - ; R1 21 - lea edx,[38016083+ebp*1+edx] - xor edi,eax - and edi,ecx - mov ebp,DWORD [60+esi] - xor edi,ebx - add edx,edi - mov edi,eax - rol edx,9 - add edx,eax - ; R1 22 - lea ecx,[3634488961+ebp*1+ecx] - xor edi,edx - and edi,ebx - mov ebp,DWORD [16+esi] - xor edi,eax - add ecx,edi - mov edi,edx - rol ecx,14 - add ecx,edx - ; R1 23 - lea ebx,[3889429448+ebp*1+ebx] - xor edi,ecx - and edi,eax - mov ebp,DWORD [36+esi] - xor edi,edx - add ebx,edi - mov edi,ecx - rol ebx,20 - add ebx,ecx - ; R1 24 - lea eax,[568446438+ebp*1+eax] - xor edi,ebx - and edi,edx - mov ebp,DWORD [56+esi] - xor edi,ecx - add eax,edi - mov edi,ebx - rol eax,5 - add eax,ebx - ; R1 25 - lea edx,[3275163606+ebp*1+edx] - xor edi,eax - and edi,ecx - mov ebp,DWORD [12+esi] - xor edi,ebx - add edx,edi - mov edi,eax - rol edx,9 - add edx,eax - ; R1 26 - lea ecx,[4107603335+ebp*1+ecx] - xor edi,edx - and edi,ebx - mov ebp,DWORD [32+esi] - xor edi,eax - add ecx,edi - mov edi,edx - rol ecx,14 - add ecx,edx - ; R1 27 - lea ebx,[1163531501+ebp*1+ebx] - xor edi,ecx - and edi,eax - mov ebp,DWORD [52+esi] - xor edi,edx - add ebx,edi - mov edi,ecx - rol ebx,20 - add ebx,ecx - ; R1 28 - lea eax,[2850285829+ebp*1+eax] - xor edi,ebx - and edi,edx - mov ebp,DWORD [8+esi] - xor edi,ecx - add eax,edi - mov edi,ebx - rol eax,5 - add eax,ebx - ; R1 29 - lea edx,[4243563512+ebp*1+edx] - xor edi,eax - and edi,ecx - mov ebp,DWORD [28+esi] - xor edi,ebx - add edx,edi - mov edi,eax - rol edx,9 - add edx,eax - ; R1 30 - lea ecx,[1735328473+ebp*1+ecx] - xor edi,edx - and edi,ebx - mov ebp,DWORD [48+esi] - xor edi,eax - add ecx,edi - mov edi,edx - rol ecx,14 - add ecx,edx - ; R1 31 - lea ebx,[2368359562+ebp*1+ebx] - xor edi,ecx - and edi,eax - mov ebp,DWORD [20+esi] - xor edi,edx - add ebx,edi - mov edi,ecx - rol ebx,20 - add ebx,ecx - ; - ; R2 section - ; R2 32 - xor edi,edx - xor edi,ebx - lea eax,[4294588738+ebp*1+eax] - add eax,edi - rol eax,4 - mov ebp,DWORD [32+esi] - mov edi,ebx - ; R2 33 - lea edx,[2272392833+ebp*1+edx] - add eax,ebx - xor edi,ecx - xor edi,eax - mov ebp,DWORD [44+esi] - add edx,edi - mov edi,eax - rol edx,11 - add edx,eax - ; R2 34 - xor edi,ebx - xor edi,edx - lea ecx,[1839030562+ebp*1+ecx] - add ecx,edi - rol ecx,16 - mov ebp,DWORD [56+esi] - mov edi,edx - ; R2 35 - lea ebx,[4259657740+ebp*1+ebx] - add ecx,edx - xor edi,eax - xor edi,ecx - mov ebp,DWORD [4+esi] - add ebx,edi - mov edi,ecx - rol ebx,23 - add ebx,ecx - ; R2 36 - xor edi,edx - xor edi,ebx - lea eax,[2763975236+ebp*1+eax] - add eax,edi - rol eax,4 - mov ebp,DWORD [16+esi] - mov edi,ebx - ; R2 37 - lea edx,[1272893353+ebp*1+edx] - add eax,ebx - xor edi,ecx - xor edi,eax - mov ebp,DWORD [28+esi] - add edx,edi - mov edi,eax - rol edx,11 - add edx,eax - ; R2 38 - xor edi,ebx - xor edi,edx - lea ecx,[4139469664+ebp*1+ecx] - add ecx,edi - rol ecx,16 - mov ebp,DWORD [40+esi] - mov edi,edx - ; R2 39 - lea ebx,[3200236656+ebp*1+ebx] - 
add ecx,edx - xor edi,eax - xor edi,ecx - mov ebp,DWORD [52+esi] - add ebx,edi - mov edi,ecx - rol ebx,23 - add ebx,ecx - ; R2 40 - xor edi,edx - xor edi,ebx - lea eax,[681279174+ebp*1+eax] - add eax,edi - rol eax,4 - mov ebp,DWORD [esi] - mov edi,ebx - ; R2 41 - lea edx,[3936430074+ebp*1+edx] - add eax,ebx - xor edi,ecx - xor edi,eax - mov ebp,DWORD [12+esi] - add edx,edi - mov edi,eax - rol edx,11 - add edx,eax - ; R2 42 - xor edi,ebx - xor edi,edx - lea ecx,[3572445317+ebp*1+ecx] - add ecx,edi - rol ecx,16 - mov ebp,DWORD [24+esi] - mov edi,edx - ; R2 43 - lea ebx,[76029189+ebp*1+ebx] - add ecx,edx - xor edi,eax - xor edi,ecx - mov ebp,DWORD [36+esi] - add ebx,edi - mov edi,ecx - rol ebx,23 - add ebx,ecx - ; R2 44 - xor edi,edx - xor edi,ebx - lea eax,[3654602809+ebp*1+eax] - add eax,edi - rol eax,4 - mov ebp,DWORD [48+esi] - mov edi,ebx - ; R2 45 - lea edx,[3873151461+ebp*1+edx] - add eax,ebx - xor edi,ecx - xor edi,eax - mov ebp,DWORD [60+esi] - add edx,edi - mov edi,eax - rol edx,11 - add edx,eax - ; R2 46 - xor edi,ebx - xor edi,edx - lea ecx,[530742520+ebp*1+ecx] - add ecx,edi - rol ecx,16 - mov ebp,DWORD [8+esi] - mov edi,edx - ; R2 47 - lea ebx,[3299628645+ebp*1+ebx] - add ecx,edx - xor edi,eax - xor edi,ecx - mov ebp,DWORD [esi] - add ebx,edi - mov edi,-1 - rol ebx,23 - add ebx,ecx - ; - ; R3 section - ; R3 48 - xor edi,edx - or edi,ebx - lea eax,[4096336452+ebp*1+eax] - xor edi,ecx - mov ebp,DWORD [28+esi] - add eax,edi - mov edi,-1 - rol eax,6 - xor edi,ecx - add eax,ebx - ; R3 49 - or edi,eax - lea edx,[1126891415+ebp*1+edx] - xor edi,ebx - mov ebp,DWORD [56+esi] - add edx,edi - mov edi,-1 - rol edx,10 - xor edi,ebx - add edx,eax - ; R3 50 - or edi,edx - lea ecx,[2878612391+ebp*1+ecx] - xor edi,eax - mov ebp,DWORD [20+esi] - add ecx,edi - mov edi,-1 - rol ecx,15 - xor edi,eax - add ecx,edx - ; R3 51 - or edi,ecx - lea ebx,[4237533241+ebp*1+ebx] - xor edi,edx - mov ebp,DWORD [48+esi] - add ebx,edi - mov edi,-1 - rol ebx,21 - xor edi,edx - add ebx,ecx - ; R3 52 - or edi,ebx - lea eax,[1700485571+ebp*1+eax] - xor edi,ecx - mov ebp,DWORD [12+esi] - add eax,edi - mov edi,-1 - rol eax,6 - xor edi,ecx - add eax,ebx - ; R3 53 - or edi,eax - lea edx,[2399980690+ebp*1+edx] - xor edi,ebx - mov ebp,DWORD [40+esi] - add edx,edi - mov edi,-1 - rol edx,10 - xor edi,ebx - add edx,eax - ; R3 54 - or edi,edx - lea ecx,[4293915773+ebp*1+ecx] - xor edi,eax - mov ebp,DWORD [4+esi] - add ecx,edi - mov edi,-1 - rol ecx,15 - xor edi,eax - add ecx,edx - ; R3 55 - or edi,ecx - lea ebx,[2240044497+ebp*1+ebx] - xor edi,edx - mov ebp,DWORD [32+esi] - add ebx,edi - mov edi,-1 - rol ebx,21 - xor edi,edx - add ebx,ecx - ; R3 56 - or edi,ebx - lea eax,[1873313359+ebp*1+eax] - xor edi,ecx - mov ebp,DWORD [60+esi] - add eax,edi - mov edi,-1 - rol eax,6 - xor edi,ecx - add eax,ebx - ; R3 57 - or edi,eax - lea edx,[4264355552+ebp*1+edx] - xor edi,ebx - mov ebp,DWORD [24+esi] - add edx,edi - mov edi,-1 - rol edx,10 - xor edi,ebx - add edx,eax - ; R3 58 - or edi,edx - lea ecx,[2734768916+ebp*1+ecx] - xor edi,eax - mov ebp,DWORD [52+esi] - add ecx,edi - mov edi,-1 - rol ecx,15 - xor edi,eax - add ecx,edx - ; R3 59 - or edi,ecx - lea ebx,[1309151649+ebp*1+ebx] - xor edi,edx - mov ebp,DWORD [16+esi] - add ebx,edi - mov edi,-1 - rol ebx,21 - xor edi,edx - add ebx,ecx - ; R3 60 - or edi,ebx - lea eax,[4149444226+ebp*1+eax] - xor edi,ecx - mov ebp,DWORD [44+esi] - add eax,edi - mov edi,-1 - rol eax,6 - xor edi,ecx - add eax,ebx - ; R3 61 - or edi,eax - lea edx,[3174756917+ebp*1+edx] - xor edi,ebx - mov ebp,DWORD [8+esi] 
- add edx,edi - mov edi,-1 - rol edx,10 - xor edi,ebx - add edx,eax - ; R3 62 - or edi,edx - lea ecx,[718787259+ebp*1+ecx] - xor edi,eax - mov ebp,DWORD [36+esi] - add ecx,edi - mov edi,-1 - rol ecx,15 - xor edi,eax - add ecx,edx - ; R3 63 - or edi,ecx - lea ebx,[3951481745+ebp*1+ebx] - xor edi,edx - mov ebp,DWORD [24+esp] - add ebx,edi - add esi,64 - rol ebx,21 - mov edi,DWORD [ebp] - add ebx,ecx - add eax,edi - mov edi,DWORD [4+ebp] - add ebx,edi - mov edi,DWORD [8+ebp] - add ecx,edi - mov edi,DWORD [12+ebp] - add edx,edi - mov DWORD [ebp],eax - mov DWORD [4+ebp],ebx - mov edi,DWORD [esp] - mov DWORD [8+ebp],ecx - mov DWORD [12+ebp],edx - cmp edi,esi - jae NEAR L$000start - pop eax - pop ebx - pop ebp - pop edi - pop esi - ret -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cbc.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cbc.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cbc.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cbc.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cfb.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cfb.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cfb.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cfb.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ctr.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ctr.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ctr.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ctr.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm_nohw.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm_nohw.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm_nohw.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm_nohw.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ofb.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ofb.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ofb.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ofb.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/polyval.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/polyval.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/polyval.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/polyval.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/ctrdrbg.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/ctrdrbg.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/ctrdrbg.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/ctrdrbg.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/internal.h index 7be2ad3d..b661d123 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/internal.h +++ 
b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/internal.h @@ -18,92 +18,13 @@ #include #include -#include "../../internal.h" +#include "../../bcm_support.h" #include "../modes/internal.h" #if defined(__cplusplus) extern "C" { #endif - -#if defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) -#define OPENSSL_RAND_DETERMINISTIC -#elif defined(OPENSSL_TRUSTY) -#define OPENSSL_RAND_TRUSTY -#elif defined(OPENSSL_WINDOWS) -#define OPENSSL_RAND_WINDOWS -#elif defined(OPENSSL_LINUX) -#define OPENSSL_RAND_URANDOM -#elif defined(OPENSSL_APPLE) && !defined(OPENSSL_MACOS) -// Unlike macOS, iOS and similar hide away getentropy(). -#define OPENSSL_RAND_IOS -#else -// By default if you are integrating BoringSSL we expect you to -// provide getentropy from the header file. -#define OPENSSL_RAND_GETENTROPY -#endif - -// RAND_bytes_with_additional_data samples from the RNG after mixing 32 bytes -// from |user_additional_data| in. -void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len, - const uint8_t user_additional_data[32]); - -#if defined(BORINGSSL_FIPS) - -// We overread from /dev/urandom or RDRAND by a factor of 10 and XOR to whiten. -#define BORINGSSL_FIPS_OVERREAD 10 - -// CRYPTO_get_seed_entropy writes |out_entropy_len| bytes of entropy, suitable -// for seeding a DRBG, to |out_entropy|. It sets |*out_used_cpu| to one if the -// entropy came directly from the CPU and zero if it came from the OS. It -// actively obtains entropy from the CPU/OS and so should not be called from -// within the FIPS module. -void CRYPTO_get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len, - int *out_used_cpu); - -// RAND_load_entropy supplies |entropy_len| bytes of entropy to the module. The -// |want_additional_input| parameter is true iff the entropy was obtained from -// a source other than the system, e.g. directly from the CPU. -void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len, - int want_additional_input); - -// RAND_need_entropy is implemented outside of the FIPS module and is called -// when the module has stopped because it has run out of entropy. -void RAND_need_entropy(size_t bytes_needed); - -#endif // BORINGSSL_FIPS - -// CRYPTO_sysrand fills |len| bytes at |buf| with entropy from the operating -// system. -void CRYPTO_sysrand(uint8_t *buf, size_t len); - -// CRYPTO_sysrand_for_seed fills |len| bytes at |buf| with entropy from the -// operating system. It may draw from the |GRND_RANDOM| pool on Android, -// depending on the vendor's configuration. -void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len); - -#if defined(OPENSSL_RAND_URANDOM) || defined(OPENSSL_RAND_WINDOWS) -// CRYPTO_init_sysrand initializes long-lived resources needed to draw entropy -// from the operating system. -void CRYPTO_init_sysrand(void); -#else -OPENSSL_INLINE void CRYPTO_init_sysrand(void) {} -#endif // defined(OPENSSL_RAND_URANDOM) || defined(OPENSSL_RAND_WINDOWS) - -#if defined(OPENSSL_RAND_URANDOM) -// CRYPTO_sysrand_if_available fills |len| bytes at |buf| with entropy from the -// operating system, or early /dev/urandom data, and returns 1, _if_ the entropy -// pool is initialized or if getrandom() is not available and not in FIPS mode. -// Otherwise it will not block and will instead fill |buf| with all zeros and -// return 0. 
-int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len); -#else -OPENSSL_INLINE int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { - CRYPTO_sysrand(buf, len); - return 1; -} -#endif // defined(OPENSSL_RAND_URANDOM) - // rand_fork_unsafe_buffering_enabled returns whether fork-unsafe buffering has // been enabled via |RAND_enable_fork_unsafe_buffering|. int rand_fork_unsafe_buffering_enabled(void); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c.inc similarity index 92% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c.inc index 1a8b198f..38ed6857 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c.inc @@ -12,8 +12,6 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#include - #include #include #include @@ -26,10 +24,10 @@ #include #include -#include "internal.h" -#include "fork_detect.h" -#include "../../internal.h" +#include "../../bcm_support.h" +#include "../bcm_interface.h" #include "../delocate.h" +#include "internal.h" // It's assumed that the operating system always has an unfailing source of @@ -99,7 +97,7 @@ static void rand_thread_state_clear_all(void) { CTR_DRBG_clear(&cur->drbg); } // The locks are deliberately left locked so that any threads that are still - // running will hang if they try to call |RAND_bytes|. It also ensures + // running will hang if they try to call |BCM_rand_bytes|. It also ensures // |rand_thread_state_free| cannot free any thread state while we've taken the // lock. } @@ -164,26 +162,24 @@ static int rdrand(uint8_t *buf, const size_t len) { #else -static int rdrand(uint8_t *buf, size_t len) { - return 0; -} +static int rdrand(uint8_t *buf, size_t len) { return 0; } #endif -#if defined(BORINGSSL_FIPS) - -void CRYPTO_get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len, - int *out_want_additional_input) { - *out_want_additional_input = 0; - if (have_rdrand() && rdrand(out_entropy, out_entropy_len)) { - *out_want_additional_input = 1; - } else { - CRYPTO_sysrand_for_seed(out_entropy, out_entropy_len); +bcm_status BCM_rand_bytes_hwrng(uint8_t *buf, const size_t len) { + if (!have_rdrand()) { + return bcm_status_failure; } + if (rdrand(buf, len)) { + return bcm_status_not_approved; + } + return bcm_status_failure; } +#if defined(BORINGSSL_FIPS) + // In passive entropy mode, entropy is supplied from outside of the module via -// |RAND_load_entropy| and is stored in global instance of the following +// |BCM_rand_load_entropy| and is stored in global instance of the following // structure. 
struct entropy_buffer { @@ -202,8 +198,8 @@ struct entropy_buffer { DEFINE_BSS_GET(struct entropy_buffer, entropy_buffer); DEFINE_STATIC_MUTEX(entropy_buffer_lock); -void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len, - int want_additional_input) { +bcm_infallible BCM_rand_load_entropy(const uint8_t *entropy, size_t entropy_len, + int want_additional_input) { struct entropy_buffer *const buffer = entropy_buffer_bss_get(); CRYPTO_MUTEX_lock_write(entropy_buffer_lock_bss_get()); @@ -214,9 +210,9 @@ void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len, OPENSSL_memcpy(&buffer->bytes[buffer->bytes_valid], entropy, entropy_len); buffer->bytes_valid += entropy_len; - buffer->want_additional_input |= - want_additional_input && (entropy_len != 0); + buffer->want_additional_input |= want_additional_input && (entropy_len != 0); CRYPTO_MUTEX_unlock_write(entropy_buffer_lock_bss_get()); + return bcm_infallible_not_approved; } // get_seed_entropy fills |out_entropy_len| bytes of |out_entropy| from the @@ -330,10 +326,10 @@ static void rand_get_seed(struct rand_thread_state *state, #endif -void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len, - const uint8_t user_additional_data[32]) { +bcm_infallible BCM_rand_bytes_with_additional_data( + uint8_t *out, size_t out_len, const uint8_t user_additional_data[32]) { if (out_len == 0) { - return; + return bcm_infallible_approved; } const uint64_t fork_generation = CRYPTO_get_fork_generation(); @@ -473,21 +469,11 @@ void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len, #if defined(BORINGSSL_FIPS) CRYPTO_MUTEX_unlock_read(&state->clear_drbg_lock); #endif + return bcm_infallible_approved; } -int RAND_bytes(uint8_t *out, size_t out_len) { +bcm_infallible BCM_rand_bytes(uint8_t *out, size_t out_len) { static const uint8_t kZeroAdditionalData[32] = {0}; - RAND_bytes_with_additional_data(out, out_len, kZeroAdditionalData); - return 1; -} - -int RAND_pseudo_bytes(uint8_t *buf, size_t len) { - return RAND_bytes(buf, len); -} - -void RAND_get_system_entropy_for_custom_prng(uint8_t *buf, size_t len) { - if (len > 256) { - abort(); - } - CRYPTO_sysrand_for_seed(buf, len); + BCM_rand_bytes_with_additional_data(out, out_len, kZeroAdditionalData); + return bcm_infallible_approved; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/blinding.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/blinding.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/blinding.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/blinding.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c.inc index 72544134..1ac8b31b 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c.inc @@ -63,11 +63,11 @@ #include #include #include -#include #include #include "internal.h" #include "../service_indicator/internal.h" +#include "../bcm_interface.h" #include "../../internal.h" @@ -369,9 +369,7 @@ int RSA_padding_add_PKCS1_PSS_mgf1(const RSA *rsa, unsigned char *EM, if (!salt) { goto err; } - if (!RAND_bytes(salt, sLen)) { - goto err; - } + BCM_rand_bytes(salt, sLen); } maskedDBLen = emLen - hLen - 1; H = EM + maskedDBLen; @@ -394,7 +392,6 @@ int RSA_padding_add_PKCS1_PSS_mgf1(const RSA 
*rsa, unsigned char *EM, } p = EM; - // Initial PS XORs with all zeroes which is a NOP so just update // pointer. Note from a test above this value is guaranteed to // be non-negative. diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c.inc similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c.inc index fc2d5b2a..eea8988b 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c.inc @@ -480,7 +480,7 @@ static const struct pkcs1_sig_prefix kPKCS1SigPrefixes[] = { }, { NID_sha1, - SHA_DIGEST_LENGTH, + BCM_SHA_DIGEST_LENGTH, 15, {0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14}, @@ -758,7 +758,8 @@ int RSA_verify_pss_mgf1(RSA *rsa, const uint8_t *digest, size_t digest_len, static int check_mod_inverse(int *out_ok, const BIGNUM *a, const BIGNUM *ainv, const BIGNUM *m, unsigned m_min_bits, BN_CTX *ctx) { - if (BN_is_negative(ainv) || BN_cmp(ainv, m) >= 0) { + if (BN_is_negative(ainv) || + constant_time_declassify_int(BN_cmp(ainv, m) >= 0)) { *out_ok = 0; return 1; } @@ -772,7 +773,7 @@ static int check_mod_inverse(int *out_ok, const BIGNUM *a, const BIGNUM *ainv, bn_mul_consttime(tmp, a, ainv, ctx) && bn_div_consttime(NULL, tmp, tmp, m, m_min_bits, ctx); if (ret) { - *out_ok = BN_is_one(tmp); + *out_ok = constant_time_declassify_int(BN_is_one(tmp)); } BN_CTX_end(ctx); return ret; @@ -831,8 +832,10 @@ int RSA_check_key(const RSA *key) { // bounds, to avoid a DoS vector in |bn_mul_consttime| below. Note that // n was bound by |rsa_check_public_key|. This also implicitly checks p and q // are odd, which is a necessary condition for Montgomery reduction. - if (BN_is_negative(key->p) || BN_cmp(key->p, key->n) >= 0 || - BN_is_negative(key->q) || BN_cmp(key->q, key->n) >= 0) { + if (BN_is_negative(key->p) || + constant_time_declassify_int(BN_cmp(key->p, key->n) >= 0) || + BN_is_negative(key->q) || + constant_time_declassify_int(BN_cmp(key->q, key->n) >= 0)) { OPENSSL_PUT_ERROR(RSA, RSA_R_N_NOT_EQUAL_P_Q); goto out; } @@ -863,7 +866,8 @@ int RSA_check_key(const RSA *key) { goto out; } - if (!BN_is_one(&tmp) || !BN_is_one(&de)) { + if (constant_time_declassify_int(!BN_is_one(&tmp)) || + constant_time_declassify_int(!BN_is_one(&de))) { OPENSSL_PUT_ERROR(RSA, RSA_R_D_E_NOT_CONGRUENT_TO_1); goto out; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c.inc similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c.inc index 50dc1f32..c13d979f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c.inc @@ -65,10 +65,10 @@ #include #include +#include "../../bcm_support.h" #include "../../internal.h" #include "../bn/internal.h" #include "../delocate.h" -#include "../rand/fork_detect.h" #include "../service_indicator/internal.h" #include "internal.h" @@ -795,7 +795,7 @@ static int mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx) { // This is a pre-condition for |mod_montgomery|. It was already checked by the // caller. - assert(BN_ucmp(I, n) < 0); + declassify_assert(BN_ucmp(I, n) < 0); if (// |m1| is the result modulo |q|. 
!mod_montgomery(r1, I, q, rsa->mont_q, p, ctx) || @@ -831,7 +831,7 @@ static int mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx) { // bound the width slightly higher, so fix it. This trips constant-time checks // because a naive data flow analysis does not realize the excess words are // publicly zero. - assert(BN_cmp(r0, n) < 0); + declassify_assert(BN_cmp(r0, n) < 0); bn_assert_fits_in_bytes(r0, BN_num_bytes(n)); if (!bn_resize_words(r0, n->width)) { goto err; @@ -1003,20 +1003,25 @@ static int generate_prime(BIGNUM *out, int bits, const BIGNUM *e, // retrying. That is, we reject a negligible fraction of primes that are // within the FIPS bound, but we will never accept a prime outside the // bound, ensuring the resulting RSA key is the right size. - if (BN_cmp(out, sqrt2) <= 0) { + // + // Values over the threshold are discarded, so it is safe to leak this + // comparison. + if (constant_time_declassify_int(BN_cmp(out, sqrt2) <= 0)) { continue; } // RSA key generation's bottleneck is discarding composites. If it fails // trial division, do not bother computing a GCD or performing Miller-Rabin. if (!bn_odd_number_is_obviously_composite(out)) { - // Check gcd(out-1, e) is one (steps 4.5 and 5.6). + // Check gcd(out-1, e) is one (steps 4.5 and 5.6). Leaking the final + // result of this comparison is safe because, if not relatively prime, the + // value will be discarded. int relatively_prime; - if (!BN_sub(tmp, out, BN_value_one()) || + if (!bn_usub_consttime(tmp, out, BN_value_one()) || !bn_is_relatively_prime(&relatively_prime, tmp, e, ctx)) { goto err; } - if (relatively_prime) { + if (constant_time_declassify_int(relatively_prime)) { // Test |out| for primality (steps 4.5.1 and 5.6.1). int is_probable_prime; if (!BN_primality_test(&is_probable_prime, out, @@ -1174,8 +1179,9 @@ static int rsa_generate_key_impl(RSA *rsa, int bits, const BIGNUM *e_value, } // Retry if |rsa->d| <= 2^|prime_bits|. See appendix B.3.1's guidance on - // values for d. - } while (BN_cmp(rsa->d, pow2_prime_bits) <= 0); + // values for d. When we retry, p and q are discarded, so it is safe to leak + // this comparison. + } while (constant_time_declassify_int(BN_cmp(rsa->d, pow2_prime_bits) <= 0)); assert(BN_num_bits(pm1) == (unsigned)prime_bits); assert(BN_num_bits(qm1) == (unsigned)prime_bits); @@ -1189,6 +1195,9 @@ static int rsa_generate_key_impl(RSA *rsa, int bits, const BIGNUM *e_value, } bn_set_minimal_width(rsa->n); + // |rsa->n| is computed from the private key, but is public. + bn_declassify(rsa->n); + // Sanity-check that |rsa->n| has the specified size. This is implied by // |generate_prime|'s bounds. 
if (BN_num_bits(rsa->n) != (unsigned)bits) { @@ -1241,6 +1250,11 @@ static int RSA_generate_key_ex_maybe_fips(RSA *rsa, int bits, int check_fips) { boringssl_ensure_rsa_self_test(); + if (rsa == NULL) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + RSA *tmp = NULL; uint32_t err; int ret = 0; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c.inc similarity index 94% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c.inc index b3282863..9bd9e7bd 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c.inc @@ -72,7 +72,8 @@ int FIPS_query_algorithm_status(const char *algorithm) { #if defined(BORINGSSL_FIPS_COUNTERS) size_t FIPS_read_counter(enum fips_counter_t counter) { - if (counter < 0 || counter > fips_counter_max) { + size_t index = (size_t)counter; + if (index > fips_counter_max) { abort(); } @@ -82,11 +83,12 @@ size_t FIPS_read_counter(enum fips_counter_t counter) { return 0; } - return array[counter]; + return array[index]; } void boringssl_fips_inc_counter(enum fips_counter_t counter) { - if (counter < 0 || counter > fips_counter_max) { + size_t index = (size_t)counter; + if (index > fips_counter_max) { abort(); } @@ -106,7 +108,7 @@ void boringssl_fips_inc_counter(enum fips_counter_t counter) { } } - array[counter]++; + array[index]++; } #else diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c.inc similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c.inc index 26439fc5..8b63d0b8 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c.inc @@ -33,6 +33,7 @@ #include #include "../../internal.h" +#include "../delocate.h" #include "../dh/internal.h" #include "../ec/internal.h" #include "../ecdsa/internal.h" @@ -77,28 +78,6 @@ static int set_bignum(BIGNUM **out, const uint8_t *in, size_t len) { return *out != NULL; } -static int serialize_ecdsa_sig(uint8_t *out, size_t out_len, - const ECDSA_SIG *sig) { - if ((out_len & 1) || // - !BN_bn2bin_padded(out, out_len / 2, sig->r) || - !BN_bn2bin_padded(out + out_len / 2, out_len / 2, sig->s)) { - return 0; - } - return 1; -} - -static ECDSA_SIG *parse_ecdsa_sig(const uint8_t *in, size_t in_len) { - ECDSA_SIG *ret = ECDSA_SIG_new(); - if (!ret || // - (in_len & 1) || - BN_bin2bn(in, in_len/2, ret->r) == NULL || - BN_bin2bn(in + in_len/2, in_len/2, ret->s) == NULL) { - ECDSA_SIG_free(ret); - ret = NULL; - } - return ret; -} - static RSA *self_test_rsa_key(void) { static const uint8_t kN[] = { 0xd3, 0x3a, 0x62, 0x9f, 0x07, 0x77, 0xb0, 0x18, 0xf3, 0xff, 0xfe, 0xcc, @@ -415,7 +394,6 @@ static int boringssl_self_test_ecc(void) { EC_POINT *ec_point_in = NULL; EC_POINT *ec_point_out = NULL; BIGNUM *ec_scalar = NULL; - ECDSA_SIG *sig = NULL; ec_key = self_test_ecdsa_key(); if (ec_key == NULL) { @@ -443,13 +421,12 @@ static int boringssl_self_test_ecc(void) { uint8_t ecdsa_k[32] = {0}; ecdsa_k[31] = 42; - sig = ecdsa_sign_with_nonce_for_known_answer_test( - kECDSASignDigest, sizeof(kECDSASignDigest), ec_key, ecdsa_k, - sizeof(ecdsa_k)); - uint8_t 
ecdsa_sign_output[64]; - if (sig == NULL || - !serialize_ecdsa_sig(ecdsa_sign_output, sizeof(ecdsa_sign_output), sig) || + size_t ecdsa_sign_output_len; + if (!ecdsa_sign_fixed_with_nonce_for_known_answer_test( + kECDSASignDigest, sizeof(kECDSASignDigest), ecdsa_sign_output, + &ecdsa_sign_output_len, sizeof(ecdsa_sign_output), ec_key, ecdsa_k, + sizeof(ecdsa_k)) || !check_test(kECDSASignSig, ecdsa_sign_output, sizeof(ecdsa_sign_output), "ECDSA-sign signature")) { fprintf(stderr, "ECDSA-sign KAT failed.\n"); @@ -470,11 +447,9 @@ static int boringssl_self_test_ecc(void) { 0x8e, 0x5f, 0x64, 0xc3, 0x7e, 0xa2, 0xcf, 0x05, 0x29, }; - ECDSA_SIG_free(sig); - sig = parse_ecdsa_sig(kECDSAVerifySig, sizeof(kECDSAVerifySig)); - if (!sig || - !ecdsa_do_verify_no_self_test(kECDSAVerifyDigest, - sizeof(kECDSAVerifyDigest), sig, ec_key)) { + if (!ecdsa_verify_fixed_no_self_test( + kECDSAVerifyDigest, sizeof(kECDSAVerifyDigest), kECDSAVerifySig, + sizeof(kECDSAVerifySig), ec_key)) { fprintf(stderr, "ECDSA-verify KAT failed.\n"); goto err; } @@ -532,7 +507,6 @@ static int boringssl_self_test_ecc(void) { EC_POINT_free(ec_point_in); EC_POINT_free(ec_point_out); BN_free(ec_scalar); - ECDSA_SIG_free(sig); return ret; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/internal.h index 5b6de8ad..c82eb332 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/internal.h @@ -28,8 +28,8 @@ void FIPS_service_indicator_update_state(void); // stop |FIPS_service_indicator_update_state| from actually updating the service // indicator. This is used when a primitive calls a potentially approved // primitive to avoid false positives. For example, just because a key -// generation calls |RAND_bytes| (and thus the approved DRBG) doesn't mean that -// the key generation operation itself is approved. +// generation calls |BCM_rand_bytes| (and thus the approved DRBG) doesn't mean +// that the key generation operation itself is approved. // // This lock nests: i.e. locking twice is fine so long as each lock is paired // with an unlock. If the (64-bit) counter overflows, the process aborts. diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c.inc similarity index 93% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c.inc index aff9b54e..71db464f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c.inc @@ -171,7 +171,6 @@ static int is_md_fips_approved_for_signing(int md_type) { // type is FIPS approved for verifying, and zero otherwise. static int is_md_fips_approved_for_verifying(int md_type) { switch (md_type) { - case NID_sha1: case NID_sha224: case NID_sha256: case NID_sha384: @@ -184,7 +183,6 @@ static int is_md_fips_approved_for_verifying(int md_type) { } static void evp_md_ctx_verify_service_indicator(const EVP_MD_CTX *ctx, - int rsa_1024_ok, int (*md_ok)(int md_type)) { if (EVP_MD_CTX_md(ctx) == NULL) { // Signature schemes without a prehash are currently never FIPS approved. 
@@ -232,8 +230,7 @@ static void evp_md_ctx_verify_service_indicator(const EVP_MD_CTX *ctx, // Check if the MD type and the RSA key size are approved. if (md_ok(md_type) && - ((rsa_1024_ok && pkey_size == 128) || pkey_size == 256 || - pkey_size == 384 || pkey_size == 512)) { + (pkey_size == 256 || pkey_size == 384 || pkey_size == 512)) { FIPS_service_indicator_update_state(); } } else if (pkey_type == EVP_PKEY_EC) { @@ -251,7 +248,7 @@ static void evp_md_ctx_verify_service_indicator(const EVP_MD_CTX *ctx, } void EC_KEY_keygen_verify_service_indicator(const EC_KEY *eckey) { - if (is_ec_fips_approved(EC_GROUP_get_curve_name(eckey->group))) { + if (is_ec_fips_approved(EC_GROUP_get_curve_name(EC_KEY_get0_group(eckey)))) { FIPS_service_indicator_update_state(); } } @@ -280,17 +277,17 @@ void EVP_Cipher_verify_service_indicator(const EVP_CIPHER_CTX *ctx) { } void EVP_DigestVerify_verify_service_indicator(const EVP_MD_CTX *ctx) { - return evp_md_ctx_verify_service_indicator(ctx, /*rsa_1024_ok=*/1, + return evp_md_ctx_verify_service_indicator(ctx, is_md_fips_approved_for_verifying); } void EVP_DigestSign_verify_service_indicator(const EVP_MD_CTX *ctx) { - return evp_md_ctx_verify_service_indicator(ctx, /*rsa_1024_ok=*/0, + return evp_md_ctx_verify_service_indicator(ctx, is_md_fips_approved_for_signing); } void HMAC_verify_service_indicator(const EVP_MD *evp_md) { - switch (evp_md->type) { + switch (EVP_MD_type(evp_md)) { case NID_sha1: case NID_sha224: case NID_sha256: @@ -303,12 +300,9 @@ void HMAC_verify_service_indicator(const EVP_MD *evp_md) { } void TLSKDF_verify_service_indicator(const EVP_MD *md) { - // HMAC-MD5/HMAC-SHA1 (both used concurrently) is approved for use in the KDF - // in TLS 1.0/1.1. HMAC-SHA{256, 384, 512} are approved for use in the KDF in - // TLS 1.2. These Key Derivation functions are to be used in the context of - // the TLS protocol. + // HMAC-SHA{256, 384, 512} are approved for use in the KDF in TLS 1.2. These + // Key Derivation functions are to be used in the context of the TLS protocol. switch (EVP_MD_type(md)) { - case NID_md5_sha1: case NID_sha256: case NID_sha384: case NID_sha512: diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/internal.h index 6ac869cb..977c4041 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/internal.h @@ -80,6 +80,7 @@ OPENSSL_INLINE int sha512_hw_capable(void) { #define SHA1_ASM_NOHW #define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW #define SHA1_ASM_SSSE3 OPENSSL_INLINE int sha1_ssse3_capable(void) { @@ -127,10 +128,14 @@ OPENSSL_INLINE int sha256_avx_capable(void) { void sha256_block_data_order_avx(uint32_t state[8], const uint8_t *data, size_t num); -// TODO(crbug.com/boringssl/673): Move the remaining CPU dispatch to C. -#define SHA512_ASM -void sha512_block_data_order(uint64_t state[8], const uint8_t *data, - size_t num_blocks); +#define SHA512_ASM_SSSE3 +OPENSSL_INLINE int sha512_ssse3_capable(void) { + // TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not + // say to. 
+ return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable(); +} +void sha512_block_data_order_ssse3(uint64_t state[8], const uint8_t *data, + size_t num); #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c.inc similarity index 96% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c.inc index 14d9390f..5f14c166 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c.inc @@ -54,35 +54,25 @@ * copied and put under another distribution licence * [including the GNU Public Licence.] */ -#include - #include #include +#include "../bcm_interface.h" #include "../../internal.h" #include "../digest/md32_common.h" #include "../service_indicator/internal.h" #include "internal.h" -int SHA1_Init(SHA_CTX *sha) { +bcm_infallible BCM_sha1_init(SHA_CTX *sha) { OPENSSL_memset(sha, 0, sizeof(SHA_CTX)); sha->h[0] = 0x67452301UL; sha->h[1] = 0xefcdab89UL; sha->h[2] = 0x98badcfeUL; sha->h[3] = 0x10325476UL; sha->h[4] = 0xc3d2e1f0UL; - return 1; -} - -uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) { - SHA_CTX ctx; - SHA1_Init(&ctx); - SHA1_Update(&ctx, data, len); - SHA1_Final(out, &ctx); - OPENSSL_cleanse(&ctx, sizeof(ctx)); - return out; + return bcm_infallible_approved; } #if !defined(SHA1_ASM) @@ -90,14 +80,15 @@ static void sha1_block_data_order(uint32_t state[5], const uint8_t *data, size_t num); #endif -void SHA1_Transform(SHA_CTX *c, const uint8_t data[SHA_CBLOCK]) { +bcm_infallible BCM_sha1_transform(SHA_CTX *c, const uint8_t data[SHA_CBLOCK]) { sha1_block_data_order(c->h, data, 1); + return bcm_infallible_approved; } -int SHA1_Update(SHA_CTX *c, const void *data, size_t len) { +bcm_infallible BCM_sha1_update(SHA_CTX *c, const void *data, size_t len) { crypto_md32_update(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num, &c->Nh, &c->Nl, data, len); - return 1; + return bcm_infallible_approved; } static void sha1_output_state(uint8_t out[SHA_DIGEST_LENGTH], @@ -109,16 +100,16 @@ static void sha1_output_state(uint8_t out[SHA_DIGEST_LENGTH], CRYPTO_store_u32_be(out + 16, ctx->h[4]); } -int SHA1_Final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *c) { +bcm_infallible BCM_sha1_final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *c) { crypto_md32_final(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num, c->Nh, c->Nl, /*is_big_endian=*/1); sha1_output_state(out, c); FIPS_service_indicator_update_state(); - return 1; + return bcm_infallible_approved; } -void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, +bcm_infallible BCM_fips_186_2_prf(uint8_t *out, size_t out_len, const uint8_t xkey[SHA_DIGEST_LENGTH]) { // XKEY and XVAL are 160-bit values, but are internally right-padded up to // block size. See FIPS 186-2, Appendix 3.3. This buffer maintains both the @@ -130,8 +121,8 @@ void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, // We always use a zero XSEED, so we can merge the inner and outer loops. // XVAL is also always equal to XKEY. 
SHA_CTX ctx; - SHA1_Init(&ctx); - SHA1_Transform(&ctx, block); + BCM_sha1_init(&ctx); + BCM_sha1_transform(&ctx, block); // XKEY = (1 + XKEY + w_i) mod 2^b uint32_t carry = 1; @@ -152,6 +143,7 @@ void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, out += SHA_DIGEST_LENGTH; out_len -= SHA_DIGEST_LENGTH; } + return bcm_infallible_not_approved; } #define Xupdate(a, ix, ia, ib, ic, id) \ diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha256.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha256.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha256.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha256.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c.inc index 8135457a..ea73199d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c.inc @@ -516,6 +516,12 @@ static void sha512_block_data_order(uint64_t state[8], const uint8_t *data, return; } #endif +#if defined(SHA512_ASM_SSSE3) + if (sha512_ssse3_capable()) { + sha512_block_data_order_ssse3(state, data, num); + return; + } +#endif #if defined(SHA512_ASM_NEON) if (CRYPTO_is_NEON_capable()) { sha512_block_data_order_neon(state, data, num); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-windows.windows.x86.S deleted file mode 100644 index 3622e531..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-windows.windows.x86.S +++ /dev/null @@ -1,3797 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _sha1_block_data_order_nohw -align 16 -_sha1_block_data_order_nohw: -L$_sha1_block_data_order_nohw_begin: - push ebp - push ebx - push esi - push edi - mov ebp,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov eax,DWORD [28+esp] - sub esp,76 - shl eax,6 - add eax,esi - mov DWORD [104+esp],eax - mov edi,DWORD [16+ebp] - jmp NEAR L$000loop -align 16 -L$000loop: - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edx,DWORD [12+esi] - bswap eax - bswap ebx - bswap ecx - bswap edx - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],edx - mov eax,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov edx,DWORD [28+esi] - bswap eax - bswap ebx - bswap ecx - bswap edx - mov DWORD [16+esp],eax - mov DWORD [20+esp],ebx - mov DWORD [24+esp],ecx - mov DWORD [28+esp],edx - mov eax,DWORD [32+esi] - mov ebx,DWORD [36+esi] - mov ecx,DWORD [40+esi] - mov edx,DWORD [44+esi] - bswap eax - bswap ebx - bswap ecx - bswap edx - mov DWORD [32+esp],eax - mov DWORD [36+esp],ebx - mov DWORD [40+esp],ecx - mov DWORD [44+esp],edx - mov eax,DWORD [48+esi] - mov ebx,DWORD [52+esi] - mov ecx,DWORD [56+esi] - mov edx,DWORD [60+esi] - bswap eax - bswap ebx - bswap ecx - bswap edx - mov DWORD [48+esp],eax - mov DWORD [52+esp],ebx - mov DWORD [56+esp],ecx - mov DWORD [60+esp],edx - mov DWORD [100+esp],esi - mov eax,DWORD [ebp] - mov ebx,DWORD [4+ebp] - mov ecx,DWORD [8+ebp] - mov edx,DWORD [12+ebp] - ; 00_15 0 - mov esi,ecx - mov ebp,eax - rol ebp,5 - xor esi,edx - add ebp,edi - mov edi,DWORD [esp] - and esi,ebx - ror ebx,2 - xor esi,edx - lea ebp,[1518500249+edi*1+ebp] - add ebp,esi - ; 00_15 1 - mov edi,ebx - mov esi,ebp - rol ebp,5 - xor edi,ecx - add ebp,edx - mov edx,DWORD [4+esp] - and edi,eax - ror eax,2 - xor edi,ecx - lea ebp,[1518500249+edx*1+ebp] - add ebp,edi - ; 00_15 2 - mov edx,eax - mov edi,ebp - rol ebp,5 - xor edx,ebx - add ebp,ecx - mov ecx,DWORD [8+esp] - and edx,esi - ror esi,2 - xor edx,ebx - lea ebp,[1518500249+ecx*1+ebp] - add ebp,edx - ; 00_15 3 - mov ecx,esi - mov edx,ebp - rol ebp,5 - xor ecx,eax - add ebp,ebx - mov ebx,DWORD [12+esp] - and ecx,edi - ror edi,2 - xor ecx,eax - lea ebp,[1518500249+ebx*1+ebp] - add ebp,ecx - ; 00_15 4 - mov ebx,edi - mov ecx,ebp - rol ebp,5 - xor ebx,esi - add ebp,eax - mov eax,DWORD [16+esp] - and ebx,edx - ror edx,2 - xor ebx,esi - lea ebp,[1518500249+eax*1+ebp] - add ebp,ebx - ; 00_15 5 - mov eax,edx - mov ebx,ebp - rol ebp,5 - xor eax,edi - add ebp,esi - mov esi,DWORD [20+esp] - and eax,ecx - ror ecx,2 - xor eax,edi - lea ebp,[1518500249+esi*1+ebp] - add ebp,eax - ; 00_15 6 - mov esi,ecx - mov eax,ebp - rol ebp,5 - xor esi,edx - add ebp,edi - mov edi,DWORD [24+esp] - and esi,ebx - ror ebx,2 - xor esi,edx - lea ebp,[1518500249+edi*1+ebp] - add ebp,esi - ; 00_15 7 - mov edi,ebx - mov esi,ebp - rol ebp,5 - xor edi,ecx - add ebp,edx - mov edx,DWORD [28+esp] - and edi,eax - ror eax,2 - xor edi,ecx - lea ebp,[1518500249+edx*1+ebp] - add ebp,edi - ; 00_15 8 - mov edx,eax - mov edi,ebp - rol ebp,5 - xor edx,ebx - add ebp,ecx - mov ecx,DWORD [32+esp] - and edx,esi - ror esi,2 - xor edx,ebx - lea ebp,[1518500249+ecx*1+ebp] - add ebp,edx - ; 00_15 9 - mov ecx,esi - mov edx,ebp - rol ebp,5 - xor 
ecx,eax - add ebp,ebx - mov ebx,DWORD [36+esp] - and ecx,edi - ror edi,2 - xor ecx,eax - lea ebp,[1518500249+ebx*1+ebp] - add ebp,ecx - ; 00_15 10 - mov ebx,edi - mov ecx,ebp - rol ebp,5 - xor ebx,esi - add ebp,eax - mov eax,DWORD [40+esp] - and ebx,edx - ror edx,2 - xor ebx,esi - lea ebp,[1518500249+eax*1+ebp] - add ebp,ebx - ; 00_15 11 - mov eax,edx - mov ebx,ebp - rol ebp,5 - xor eax,edi - add ebp,esi - mov esi,DWORD [44+esp] - and eax,ecx - ror ecx,2 - xor eax,edi - lea ebp,[1518500249+esi*1+ebp] - add ebp,eax - ; 00_15 12 - mov esi,ecx - mov eax,ebp - rol ebp,5 - xor esi,edx - add ebp,edi - mov edi,DWORD [48+esp] - and esi,ebx - ror ebx,2 - xor esi,edx - lea ebp,[1518500249+edi*1+ebp] - add ebp,esi - ; 00_15 13 - mov edi,ebx - mov esi,ebp - rol ebp,5 - xor edi,ecx - add ebp,edx - mov edx,DWORD [52+esp] - and edi,eax - ror eax,2 - xor edi,ecx - lea ebp,[1518500249+edx*1+ebp] - add ebp,edi - ; 00_15 14 - mov edx,eax - mov edi,ebp - rol ebp,5 - xor edx,ebx - add ebp,ecx - mov ecx,DWORD [56+esp] - and edx,esi - ror esi,2 - xor edx,ebx - lea ebp,[1518500249+ecx*1+ebp] - add ebp,edx - ; 00_15 15 - mov ecx,esi - mov edx,ebp - rol ebp,5 - xor ecx,eax - add ebp,ebx - mov ebx,DWORD [60+esp] - and ecx,edi - ror edi,2 - xor ecx,eax - lea ebp,[1518500249+ebx*1+ebp] - mov ebx,DWORD [esp] - add ecx,ebp - ; 16_19 16 - mov ebp,edi - xor ebx,DWORD [8+esp] - xor ebp,esi - xor ebx,DWORD [32+esp] - and ebp,edx - xor ebx,DWORD [52+esp] - rol ebx,1 - xor ebp,esi - add eax,ebp - mov ebp,ecx - ror edx,2 - mov DWORD [esp],ebx - rol ebp,5 - lea ebx,[1518500249+eax*1+ebx] - mov eax,DWORD [4+esp] - add ebx,ebp - ; 16_19 17 - mov ebp,edx - xor eax,DWORD [12+esp] - xor ebp,edi - xor eax,DWORD [36+esp] - and ebp,ecx - xor eax,DWORD [56+esp] - rol eax,1 - xor ebp,edi - add esi,ebp - mov ebp,ebx - ror ecx,2 - mov DWORD [4+esp],eax - rol ebp,5 - lea eax,[1518500249+esi*1+eax] - mov esi,DWORD [8+esp] - add eax,ebp - ; 16_19 18 - mov ebp,ecx - xor esi,DWORD [16+esp] - xor ebp,edx - xor esi,DWORD [40+esp] - and ebp,ebx - xor esi,DWORD [60+esp] - rol esi,1 - xor ebp,edx - add edi,ebp - mov ebp,eax - ror ebx,2 - mov DWORD [8+esp],esi - rol ebp,5 - lea esi,[1518500249+edi*1+esi] - mov edi,DWORD [12+esp] - add esi,ebp - ; 16_19 19 - mov ebp,ebx - xor edi,DWORD [20+esp] - xor ebp,ecx - xor edi,DWORD [44+esp] - and ebp,eax - xor edi,DWORD [esp] - rol edi,1 - xor ebp,ecx - add edx,ebp - mov ebp,esi - ror eax,2 - mov DWORD [12+esp],edi - rol ebp,5 - lea edi,[1518500249+edx*1+edi] - mov edx,DWORD [16+esp] - add edi,ebp - ; 20_39 20 - mov ebp,esi - xor edx,DWORD [24+esp] - xor ebp,eax - xor edx,DWORD [48+esp] - xor ebp,ebx - xor edx,DWORD [4+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [16+esp],edx - lea edx,[1859775393+ecx*1+edx] - mov ecx,DWORD [20+esp] - add edx,ebp - ; 20_39 21 - mov ebp,edi - xor ecx,DWORD [28+esp] - xor ebp,esi - xor ecx,DWORD [52+esp] - xor ebp,eax - xor ecx,DWORD [8+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [20+esp],ecx - lea ecx,[1859775393+ebx*1+ecx] - mov ebx,DWORD [24+esp] - add ecx,ebp - ; 20_39 22 - mov ebp,edx - xor ebx,DWORD [32+esp] - xor ebp,edi - xor ebx,DWORD [56+esp] - xor ebp,esi - xor ebx,DWORD [12+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [24+esp],ebx - lea ebx,[1859775393+eax*1+ebx] - mov eax,DWORD [28+esp] - add ebx,ebp - ; 20_39 23 - mov ebp,ecx - xor eax,DWORD [36+esp] - xor ebp,edx - xor eax,DWORD [60+esp] - xor ebp,edi - xor eax,DWORD [16+esp] - rol eax,1 - add esi,ebp - 
ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [28+esp],eax - lea eax,[1859775393+esi*1+eax] - mov esi,DWORD [32+esp] - add eax,ebp - ; 20_39 24 - mov ebp,ebx - xor esi,DWORD [40+esp] - xor ebp,ecx - xor esi,DWORD [esp] - xor ebp,edx - xor esi,DWORD [20+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [32+esp],esi - lea esi,[1859775393+edi*1+esi] - mov edi,DWORD [36+esp] - add esi,ebp - ; 20_39 25 - mov ebp,eax - xor edi,DWORD [44+esp] - xor ebp,ebx - xor edi,DWORD [4+esp] - xor ebp,ecx - xor edi,DWORD [24+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [36+esp],edi - lea edi,[1859775393+edx*1+edi] - mov edx,DWORD [40+esp] - add edi,ebp - ; 20_39 26 - mov ebp,esi - xor edx,DWORD [48+esp] - xor ebp,eax - xor edx,DWORD [8+esp] - xor ebp,ebx - xor edx,DWORD [28+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [40+esp],edx - lea edx,[1859775393+ecx*1+edx] - mov ecx,DWORD [44+esp] - add edx,ebp - ; 20_39 27 - mov ebp,edi - xor ecx,DWORD [52+esp] - xor ebp,esi - xor ecx,DWORD [12+esp] - xor ebp,eax - xor ecx,DWORD [32+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [44+esp],ecx - lea ecx,[1859775393+ebx*1+ecx] - mov ebx,DWORD [48+esp] - add ecx,ebp - ; 20_39 28 - mov ebp,edx - xor ebx,DWORD [56+esp] - xor ebp,edi - xor ebx,DWORD [16+esp] - xor ebp,esi - xor ebx,DWORD [36+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [48+esp],ebx - lea ebx,[1859775393+eax*1+ebx] - mov eax,DWORD [52+esp] - add ebx,ebp - ; 20_39 29 - mov ebp,ecx - xor eax,DWORD [60+esp] - xor ebp,edx - xor eax,DWORD [20+esp] - xor ebp,edi - xor eax,DWORD [40+esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [52+esp],eax - lea eax,[1859775393+esi*1+eax] - mov esi,DWORD [56+esp] - add eax,ebp - ; 20_39 30 - mov ebp,ebx - xor esi,DWORD [esp] - xor ebp,ecx - xor esi,DWORD [24+esp] - xor ebp,edx - xor esi,DWORD [44+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [56+esp],esi - lea esi,[1859775393+edi*1+esi] - mov edi,DWORD [60+esp] - add esi,ebp - ; 20_39 31 - mov ebp,eax - xor edi,DWORD [4+esp] - xor ebp,ebx - xor edi,DWORD [28+esp] - xor ebp,ecx - xor edi,DWORD [48+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [60+esp],edi - lea edi,[1859775393+edx*1+edi] - mov edx,DWORD [esp] - add edi,ebp - ; 20_39 32 - mov ebp,esi - xor edx,DWORD [8+esp] - xor ebp,eax - xor edx,DWORD [32+esp] - xor ebp,ebx - xor edx,DWORD [52+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [esp],edx - lea edx,[1859775393+ecx*1+edx] - mov ecx,DWORD [4+esp] - add edx,ebp - ; 20_39 33 - mov ebp,edi - xor ecx,DWORD [12+esp] - xor ebp,esi - xor ecx,DWORD [36+esp] - xor ebp,eax - xor ecx,DWORD [56+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [4+esp],ecx - lea ecx,[1859775393+ebx*1+ecx] - mov ebx,DWORD [8+esp] - add ecx,ebp - ; 20_39 34 - mov ebp,edx - xor ebx,DWORD [16+esp] - xor ebp,edi - xor ebx,DWORD [40+esp] - xor ebp,esi - xor ebx,DWORD [60+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [8+esp],ebx - lea ebx,[1859775393+eax*1+ebx] - mov eax,DWORD [12+esp] - add ebx,ebp - ; 20_39 35 - mov ebp,ecx - xor eax,DWORD [20+esp] - xor ebp,edx - xor eax,DWORD [44+esp] - xor ebp,edi - xor eax,DWORD [esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [12+esp],eax - lea 
eax,[1859775393+esi*1+eax] - mov esi,DWORD [16+esp] - add eax,ebp - ; 20_39 36 - mov ebp,ebx - xor esi,DWORD [24+esp] - xor ebp,ecx - xor esi,DWORD [48+esp] - xor ebp,edx - xor esi,DWORD [4+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [16+esp],esi - lea esi,[1859775393+edi*1+esi] - mov edi,DWORD [20+esp] - add esi,ebp - ; 20_39 37 - mov ebp,eax - xor edi,DWORD [28+esp] - xor ebp,ebx - xor edi,DWORD [52+esp] - xor ebp,ecx - xor edi,DWORD [8+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [20+esp],edi - lea edi,[1859775393+edx*1+edi] - mov edx,DWORD [24+esp] - add edi,ebp - ; 20_39 38 - mov ebp,esi - xor edx,DWORD [32+esp] - xor ebp,eax - xor edx,DWORD [56+esp] - xor ebp,ebx - xor edx,DWORD [12+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [24+esp],edx - lea edx,[1859775393+ecx*1+edx] - mov ecx,DWORD [28+esp] - add edx,ebp - ; 20_39 39 - mov ebp,edi - xor ecx,DWORD [36+esp] - xor ebp,esi - xor ecx,DWORD [60+esp] - xor ebp,eax - xor ecx,DWORD [16+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [28+esp],ecx - lea ecx,[1859775393+ebx*1+ecx] - mov ebx,DWORD [32+esp] - add ecx,ebp - ; 40_59 40 - mov ebp,edi - xor ebx,DWORD [40+esp] - xor ebp,esi - xor ebx,DWORD [esp] - and ebp,edx - xor ebx,DWORD [20+esp] - rol ebx,1 - add ebp,eax - ror edx,2 - mov eax,ecx - rol eax,5 - mov DWORD [32+esp],ebx - lea ebx,[2400959708+ebp*1+ebx] - mov ebp,edi - add ebx,eax - and ebp,esi - mov eax,DWORD [36+esp] - add ebx,ebp - ; 40_59 41 - mov ebp,edx - xor eax,DWORD [44+esp] - xor ebp,edi - xor eax,DWORD [4+esp] - and ebp,ecx - xor eax,DWORD [24+esp] - rol eax,1 - add ebp,esi - ror ecx,2 - mov esi,ebx - rol esi,5 - mov DWORD [36+esp],eax - lea eax,[2400959708+ebp*1+eax] - mov ebp,edx - add eax,esi - and ebp,edi - mov esi,DWORD [40+esp] - add eax,ebp - ; 40_59 42 - mov ebp,ecx - xor esi,DWORD [48+esp] - xor ebp,edx - xor esi,DWORD [8+esp] - and ebp,ebx - xor esi,DWORD [28+esp] - rol esi,1 - add ebp,edi - ror ebx,2 - mov edi,eax - rol edi,5 - mov DWORD [40+esp],esi - lea esi,[2400959708+ebp*1+esi] - mov ebp,ecx - add esi,edi - and ebp,edx - mov edi,DWORD [44+esp] - add esi,ebp - ; 40_59 43 - mov ebp,ebx - xor edi,DWORD [52+esp] - xor ebp,ecx - xor edi,DWORD [12+esp] - and ebp,eax - xor edi,DWORD [32+esp] - rol edi,1 - add ebp,edx - ror eax,2 - mov edx,esi - rol edx,5 - mov DWORD [44+esp],edi - lea edi,[2400959708+ebp*1+edi] - mov ebp,ebx - add edi,edx - and ebp,ecx - mov edx,DWORD [48+esp] - add edi,ebp - ; 40_59 44 - mov ebp,eax - xor edx,DWORD [56+esp] - xor ebp,ebx - xor edx,DWORD [16+esp] - and ebp,esi - xor edx,DWORD [36+esp] - rol edx,1 - add ebp,ecx - ror esi,2 - mov ecx,edi - rol ecx,5 - mov DWORD [48+esp],edx - lea edx,[2400959708+ebp*1+edx] - mov ebp,eax - add edx,ecx - and ebp,ebx - mov ecx,DWORD [52+esp] - add edx,ebp - ; 40_59 45 - mov ebp,esi - xor ecx,DWORD [60+esp] - xor ebp,eax - xor ecx,DWORD [20+esp] - and ebp,edi - xor ecx,DWORD [40+esp] - rol ecx,1 - add ebp,ebx - ror edi,2 - mov ebx,edx - rol ebx,5 - mov DWORD [52+esp],ecx - lea ecx,[2400959708+ebp*1+ecx] - mov ebp,esi - add ecx,ebx - and ebp,eax - mov ebx,DWORD [56+esp] - add ecx,ebp - ; 40_59 46 - mov ebp,edi - xor ebx,DWORD [esp] - xor ebp,esi - xor ebx,DWORD [24+esp] - and ebp,edx - xor ebx,DWORD [44+esp] - rol ebx,1 - add ebp,eax - ror edx,2 - mov eax,ecx - rol eax,5 - mov DWORD [56+esp],ebx - lea ebx,[2400959708+ebp*1+ebx] - mov ebp,edi - add ebx,eax - and ebp,esi - mov eax,DWORD [60+esp] - add ebx,ebp - ; 
40_59 47 - mov ebp,edx - xor eax,DWORD [4+esp] - xor ebp,edi - xor eax,DWORD [28+esp] - and ebp,ecx - xor eax,DWORD [48+esp] - rol eax,1 - add ebp,esi - ror ecx,2 - mov esi,ebx - rol esi,5 - mov DWORD [60+esp],eax - lea eax,[2400959708+ebp*1+eax] - mov ebp,edx - add eax,esi - and ebp,edi - mov esi,DWORD [esp] - add eax,ebp - ; 40_59 48 - mov ebp,ecx - xor esi,DWORD [8+esp] - xor ebp,edx - xor esi,DWORD [32+esp] - and ebp,ebx - xor esi,DWORD [52+esp] - rol esi,1 - add ebp,edi - ror ebx,2 - mov edi,eax - rol edi,5 - mov DWORD [esp],esi - lea esi,[2400959708+ebp*1+esi] - mov ebp,ecx - add esi,edi - and ebp,edx - mov edi,DWORD [4+esp] - add esi,ebp - ; 40_59 49 - mov ebp,ebx - xor edi,DWORD [12+esp] - xor ebp,ecx - xor edi,DWORD [36+esp] - and ebp,eax - xor edi,DWORD [56+esp] - rol edi,1 - add ebp,edx - ror eax,2 - mov edx,esi - rol edx,5 - mov DWORD [4+esp],edi - lea edi,[2400959708+ebp*1+edi] - mov ebp,ebx - add edi,edx - and ebp,ecx - mov edx,DWORD [8+esp] - add edi,ebp - ; 40_59 50 - mov ebp,eax - xor edx,DWORD [16+esp] - xor ebp,ebx - xor edx,DWORD [40+esp] - and ebp,esi - xor edx,DWORD [60+esp] - rol edx,1 - add ebp,ecx - ror esi,2 - mov ecx,edi - rol ecx,5 - mov DWORD [8+esp],edx - lea edx,[2400959708+ebp*1+edx] - mov ebp,eax - add edx,ecx - and ebp,ebx - mov ecx,DWORD [12+esp] - add edx,ebp - ; 40_59 51 - mov ebp,esi - xor ecx,DWORD [20+esp] - xor ebp,eax - xor ecx,DWORD [44+esp] - and ebp,edi - xor ecx,DWORD [esp] - rol ecx,1 - add ebp,ebx - ror edi,2 - mov ebx,edx - rol ebx,5 - mov DWORD [12+esp],ecx - lea ecx,[2400959708+ebp*1+ecx] - mov ebp,esi - add ecx,ebx - and ebp,eax - mov ebx,DWORD [16+esp] - add ecx,ebp - ; 40_59 52 - mov ebp,edi - xor ebx,DWORD [24+esp] - xor ebp,esi - xor ebx,DWORD [48+esp] - and ebp,edx - xor ebx,DWORD [4+esp] - rol ebx,1 - add ebp,eax - ror edx,2 - mov eax,ecx - rol eax,5 - mov DWORD [16+esp],ebx - lea ebx,[2400959708+ebp*1+ebx] - mov ebp,edi - add ebx,eax - and ebp,esi - mov eax,DWORD [20+esp] - add ebx,ebp - ; 40_59 53 - mov ebp,edx - xor eax,DWORD [28+esp] - xor ebp,edi - xor eax,DWORD [52+esp] - and ebp,ecx - xor eax,DWORD [8+esp] - rol eax,1 - add ebp,esi - ror ecx,2 - mov esi,ebx - rol esi,5 - mov DWORD [20+esp],eax - lea eax,[2400959708+ebp*1+eax] - mov ebp,edx - add eax,esi - and ebp,edi - mov esi,DWORD [24+esp] - add eax,ebp - ; 40_59 54 - mov ebp,ecx - xor esi,DWORD [32+esp] - xor ebp,edx - xor esi,DWORD [56+esp] - and ebp,ebx - xor esi,DWORD [12+esp] - rol esi,1 - add ebp,edi - ror ebx,2 - mov edi,eax - rol edi,5 - mov DWORD [24+esp],esi - lea esi,[2400959708+ebp*1+esi] - mov ebp,ecx - add esi,edi - and ebp,edx - mov edi,DWORD [28+esp] - add esi,ebp - ; 40_59 55 - mov ebp,ebx - xor edi,DWORD [36+esp] - xor ebp,ecx - xor edi,DWORD [60+esp] - and ebp,eax - xor edi,DWORD [16+esp] - rol edi,1 - add ebp,edx - ror eax,2 - mov edx,esi - rol edx,5 - mov DWORD [28+esp],edi - lea edi,[2400959708+ebp*1+edi] - mov ebp,ebx - add edi,edx - and ebp,ecx - mov edx,DWORD [32+esp] - add edi,ebp - ; 40_59 56 - mov ebp,eax - xor edx,DWORD [40+esp] - xor ebp,ebx - xor edx,DWORD [esp] - and ebp,esi - xor edx,DWORD [20+esp] - rol edx,1 - add ebp,ecx - ror esi,2 - mov ecx,edi - rol ecx,5 - mov DWORD [32+esp],edx - lea edx,[2400959708+ebp*1+edx] - mov ebp,eax - add edx,ecx - and ebp,ebx - mov ecx,DWORD [36+esp] - add edx,ebp - ; 40_59 57 - mov ebp,esi - xor ecx,DWORD [44+esp] - xor ebp,eax - xor ecx,DWORD [4+esp] - and ebp,edi - xor ecx,DWORD [24+esp] - rol ecx,1 - add ebp,ebx - ror edi,2 - mov ebx,edx - rol ebx,5 - mov DWORD [36+esp],ecx - lea 
ecx,[2400959708+ebp*1+ecx] - mov ebp,esi - add ecx,ebx - and ebp,eax - mov ebx,DWORD [40+esp] - add ecx,ebp - ; 40_59 58 - mov ebp,edi - xor ebx,DWORD [48+esp] - xor ebp,esi - xor ebx,DWORD [8+esp] - and ebp,edx - xor ebx,DWORD [28+esp] - rol ebx,1 - add ebp,eax - ror edx,2 - mov eax,ecx - rol eax,5 - mov DWORD [40+esp],ebx - lea ebx,[2400959708+ebp*1+ebx] - mov ebp,edi - add ebx,eax - and ebp,esi - mov eax,DWORD [44+esp] - add ebx,ebp - ; 40_59 59 - mov ebp,edx - xor eax,DWORD [52+esp] - xor ebp,edi - xor eax,DWORD [12+esp] - and ebp,ecx - xor eax,DWORD [32+esp] - rol eax,1 - add ebp,esi - ror ecx,2 - mov esi,ebx - rol esi,5 - mov DWORD [44+esp],eax - lea eax,[2400959708+ebp*1+eax] - mov ebp,edx - add eax,esi - and ebp,edi - mov esi,DWORD [48+esp] - add eax,ebp - ; 20_39 60 - mov ebp,ebx - xor esi,DWORD [56+esp] - xor ebp,ecx - xor esi,DWORD [16+esp] - xor ebp,edx - xor esi,DWORD [36+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [48+esp],esi - lea esi,[3395469782+edi*1+esi] - mov edi,DWORD [52+esp] - add esi,ebp - ; 20_39 61 - mov ebp,eax - xor edi,DWORD [60+esp] - xor ebp,ebx - xor edi,DWORD [20+esp] - xor ebp,ecx - xor edi,DWORD [40+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [52+esp],edi - lea edi,[3395469782+edx*1+edi] - mov edx,DWORD [56+esp] - add edi,ebp - ; 20_39 62 - mov ebp,esi - xor edx,DWORD [esp] - xor ebp,eax - xor edx,DWORD [24+esp] - xor ebp,ebx - xor edx,DWORD [44+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [56+esp],edx - lea edx,[3395469782+ecx*1+edx] - mov ecx,DWORD [60+esp] - add edx,ebp - ; 20_39 63 - mov ebp,edi - xor ecx,DWORD [4+esp] - xor ebp,esi - xor ecx,DWORD [28+esp] - xor ebp,eax - xor ecx,DWORD [48+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [60+esp],ecx - lea ecx,[3395469782+ebx*1+ecx] - mov ebx,DWORD [esp] - add ecx,ebp - ; 20_39 64 - mov ebp,edx - xor ebx,DWORD [8+esp] - xor ebp,edi - xor ebx,DWORD [32+esp] - xor ebp,esi - xor ebx,DWORD [52+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [esp],ebx - lea ebx,[3395469782+eax*1+ebx] - mov eax,DWORD [4+esp] - add ebx,ebp - ; 20_39 65 - mov ebp,ecx - xor eax,DWORD [12+esp] - xor ebp,edx - xor eax,DWORD [36+esp] - xor ebp,edi - xor eax,DWORD [56+esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [4+esp],eax - lea eax,[3395469782+esi*1+eax] - mov esi,DWORD [8+esp] - add eax,ebp - ; 20_39 66 - mov ebp,ebx - xor esi,DWORD [16+esp] - xor ebp,ecx - xor esi,DWORD [40+esp] - xor ebp,edx - xor esi,DWORD [60+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [8+esp],esi - lea esi,[3395469782+edi*1+esi] - mov edi,DWORD [12+esp] - add esi,ebp - ; 20_39 67 - mov ebp,eax - xor edi,DWORD [20+esp] - xor ebp,ebx - xor edi,DWORD [44+esp] - xor ebp,ecx - xor edi,DWORD [esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [12+esp],edi - lea edi,[3395469782+edx*1+edi] - mov edx,DWORD [16+esp] - add edi,ebp - ; 20_39 68 - mov ebp,esi - xor edx,DWORD [24+esp] - xor ebp,eax - xor edx,DWORD [48+esp] - xor ebp,ebx - xor edx,DWORD [4+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [16+esp],edx - lea edx,[3395469782+ecx*1+edx] - mov ecx,DWORD [20+esp] - add edx,ebp - ; 20_39 69 - mov ebp,edi - xor ecx,DWORD [28+esp] - xor ebp,esi - xor ecx,DWORD [52+esp] - xor ebp,eax - xor ecx,DWORD [8+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx 
- rol ebp,5 - mov DWORD [20+esp],ecx - lea ecx,[3395469782+ebx*1+ecx] - mov ebx,DWORD [24+esp] - add ecx,ebp - ; 20_39 70 - mov ebp,edx - xor ebx,DWORD [32+esp] - xor ebp,edi - xor ebx,DWORD [56+esp] - xor ebp,esi - xor ebx,DWORD [12+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [24+esp],ebx - lea ebx,[3395469782+eax*1+ebx] - mov eax,DWORD [28+esp] - add ebx,ebp - ; 20_39 71 - mov ebp,ecx - xor eax,DWORD [36+esp] - xor ebp,edx - xor eax,DWORD [60+esp] - xor ebp,edi - xor eax,DWORD [16+esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [28+esp],eax - lea eax,[3395469782+esi*1+eax] - mov esi,DWORD [32+esp] - add eax,ebp - ; 20_39 72 - mov ebp,ebx - xor esi,DWORD [40+esp] - xor ebp,ecx - xor esi,DWORD [esp] - xor ebp,edx - xor esi,DWORD [20+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [32+esp],esi - lea esi,[3395469782+edi*1+esi] - mov edi,DWORD [36+esp] - add esi,ebp - ; 20_39 73 - mov ebp,eax - xor edi,DWORD [44+esp] - xor ebp,ebx - xor edi,DWORD [4+esp] - xor ebp,ecx - xor edi,DWORD [24+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [36+esp],edi - lea edi,[3395469782+edx*1+edi] - mov edx,DWORD [40+esp] - add edi,ebp - ; 20_39 74 - mov ebp,esi - xor edx,DWORD [48+esp] - xor ebp,eax - xor edx,DWORD [8+esp] - xor ebp,ebx - xor edx,DWORD [28+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [40+esp],edx - lea edx,[3395469782+ecx*1+edx] - mov ecx,DWORD [44+esp] - add edx,ebp - ; 20_39 75 - mov ebp,edi - xor ecx,DWORD [52+esp] - xor ebp,esi - xor ecx,DWORD [12+esp] - xor ebp,eax - xor ecx,DWORD [32+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [44+esp],ecx - lea ecx,[3395469782+ebx*1+ecx] - mov ebx,DWORD [48+esp] - add ecx,ebp - ; 20_39 76 - mov ebp,edx - xor ebx,DWORD [56+esp] - xor ebp,edi - xor ebx,DWORD [16+esp] - xor ebp,esi - xor ebx,DWORD [36+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [48+esp],ebx - lea ebx,[3395469782+eax*1+ebx] - mov eax,DWORD [52+esp] - add ebx,ebp - ; 20_39 77 - mov ebp,ecx - xor eax,DWORD [60+esp] - xor ebp,edx - xor eax,DWORD [20+esp] - xor ebp,edi - xor eax,DWORD [40+esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - lea eax,[3395469782+esi*1+eax] - mov esi,DWORD [56+esp] - add eax,ebp - ; 20_39 78 - mov ebp,ebx - xor esi,DWORD [esp] - xor ebp,ecx - xor esi,DWORD [24+esp] - xor ebp,edx - xor esi,DWORD [44+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - lea esi,[3395469782+edi*1+esi] - mov edi,DWORD [60+esp] - add esi,ebp - ; 20_39 79 - mov ebp,eax - xor edi,DWORD [4+esp] - xor ebp,ebx - xor edi,DWORD [28+esp] - xor ebp,ecx - xor edi,DWORD [48+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - lea edi,[3395469782+edx*1+edi] - add edi,ebp - mov ebp,DWORD [96+esp] - mov edx,DWORD [100+esp] - add edi,DWORD [ebp] - add esi,DWORD [4+ebp] - add eax,DWORD [8+ebp] - add ebx,DWORD [12+ebp] - add ecx,DWORD [16+ebp] - mov DWORD [ebp],edi - add edx,64 - mov DWORD [4+ebp],esi - cmp edx,DWORD [104+esp] - mov DWORD [8+ebp],eax - mov edi,ecx - mov DWORD [12+ebp],ebx - mov esi,edx - mov DWORD [16+ebp],ecx - jb NEAR L$000loop - add esp,76 - pop edi - pop esi - pop ebx - pop ebp - ret -global _sha1_block_data_order_ssse3 -align 16 -_sha1_block_data_order_ssse3: -L$_sha1_block_data_order_ssse3_begin: - push ebp - push ebx - push esi - push edi - call L$001pic_point -L$001pic_point: - pop 
ebp - lea ebp,[(L$K_XX_XX-L$001pic_point)+ebp] - movdqa xmm7,[ebp] - movdqa xmm0,[16+ebp] - movdqa xmm1,[32+ebp] - movdqa xmm2,[48+ebp] - movdqa xmm6,[64+ebp] - mov edi,DWORD [20+esp] - mov ebp,DWORD [24+esp] - mov edx,DWORD [28+esp] - mov esi,esp - sub esp,208 - and esp,-64 - movdqa [112+esp],xmm0 - movdqa [128+esp],xmm1 - movdqa [144+esp],xmm2 - shl edx,6 - movdqa [160+esp],xmm7 - add edx,ebp - movdqa [176+esp],xmm6 - add ebp,64 - mov DWORD [192+esp],edi - mov DWORD [196+esp],ebp - mov DWORD [200+esp],edx - mov DWORD [204+esp],esi - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - mov edi,DWORD [16+edi] - mov esi,ebx - movdqu xmm0,[ebp-64] - movdqu xmm1,[ebp-48] - movdqu xmm2,[ebp-32] - movdqu xmm3,[ebp-16] -db 102,15,56,0,198 -db 102,15,56,0,206 -db 102,15,56,0,214 - movdqa [96+esp],xmm7 -db 102,15,56,0,222 - paddd xmm0,xmm7 - paddd xmm1,xmm7 - paddd xmm2,xmm7 - movdqa [esp],xmm0 - psubd xmm0,xmm7 - movdqa [16+esp],xmm1 - psubd xmm1,xmm7 - movdqa [32+esp],xmm2 - mov ebp,ecx - psubd xmm2,xmm7 - xor ebp,edx - pshufd xmm4,xmm0,238 - and esi,ebp - jmp NEAR L$002loop -align 16 -L$002loop: - ror ebx,2 - xor esi,edx - mov ebp,eax - punpcklqdq xmm4,xmm1 - movdqa xmm6,xmm3 - add edi,DWORD [esp] - xor ebx,ecx - paddd xmm7,xmm3 - movdqa [64+esp],xmm0 - rol eax,5 - add edi,esi - psrldq xmm6,4 - and ebp,ebx - xor ebx,ecx - pxor xmm4,xmm0 - add edi,eax - ror eax,7 - pxor xmm6,xmm2 - xor ebp,ecx - mov esi,edi - add edx,DWORD [4+esp] - pxor xmm4,xmm6 - xor eax,ebx - rol edi,5 - movdqa [48+esp],xmm7 - add edx,ebp - and esi,eax - movdqa xmm0,xmm4 - xor eax,ebx - add edx,edi - ror edi,7 - movdqa xmm6,xmm4 - xor esi,ebx - pslldq xmm0,12 - paddd xmm4,xmm4 - mov ebp,edx - add ecx,DWORD [8+esp] - psrld xmm6,31 - xor edi,eax - rol edx,5 - movdqa xmm7,xmm0 - add ecx,esi - and ebp,edi - xor edi,eax - psrld xmm0,30 - add ecx,edx - ror edx,7 - por xmm4,xmm6 - xor ebp,eax - mov esi,ecx - add ebx,DWORD [12+esp] - pslld xmm7,2 - xor edx,edi - rol ecx,5 - pxor xmm4,xmm0 - movdqa xmm0,[96+esp] - add ebx,ebp - and esi,edx - pxor xmm4,xmm7 - pshufd xmm5,xmm1,238 - xor edx,edi - add ebx,ecx - ror ecx,7 - xor esi,edi - mov ebp,ebx - punpcklqdq xmm5,xmm2 - movdqa xmm7,xmm4 - add eax,DWORD [16+esp] - xor ecx,edx - paddd xmm0,xmm4 - movdqa [80+esp],xmm1 - rol ebx,5 - add eax,esi - psrldq xmm7,4 - and ebp,ecx - xor ecx,edx - pxor xmm5,xmm1 - add eax,ebx - ror ebx,7 - pxor xmm7,xmm3 - xor ebp,edx - mov esi,eax - add edi,DWORD [20+esp] - pxor xmm5,xmm7 - xor ebx,ecx - rol eax,5 - movdqa [esp],xmm0 - add edi,ebp - and esi,ebx - movdqa xmm1,xmm5 - xor ebx,ecx - add edi,eax - ror eax,7 - movdqa xmm7,xmm5 - xor esi,ecx - pslldq xmm1,12 - paddd xmm5,xmm5 - mov ebp,edi - add edx,DWORD [24+esp] - psrld xmm7,31 - xor eax,ebx - rol edi,5 - movdqa xmm0,xmm1 - add edx,esi - and ebp,eax - xor eax,ebx - psrld xmm1,30 - add edx,edi - ror edi,7 - por xmm5,xmm7 - xor ebp,ebx - mov esi,edx - add ecx,DWORD [28+esp] - pslld xmm0,2 - xor edi,eax - rol edx,5 - pxor xmm5,xmm1 - movdqa xmm1,[112+esp] - add ecx,ebp - and esi,edi - pxor xmm5,xmm0 - pshufd xmm6,xmm2,238 - xor edi,eax - add ecx,edx - ror edx,7 - xor esi,eax - mov ebp,ecx - punpcklqdq xmm6,xmm3 - movdqa xmm0,xmm5 - add ebx,DWORD [32+esp] - xor edx,edi - paddd xmm1,xmm5 - movdqa [96+esp],xmm2 - rol ecx,5 - add ebx,esi - psrldq xmm0,4 - and ebp,edx - xor edx,edi - pxor xmm6,xmm2 - add ebx,ecx - ror ecx,7 - pxor xmm0,xmm4 - xor ebp,edi - mov esi,ebx - add eax,DWORD [36+esp] - pxor xmm6,xmm0 - xor ecx,edx - rol ebx,5 - movdqa [16+esp],xmm1 - add 
eax,ebp - and esi,ecx - movdqa xmm2,xmm6 - xor ecx,edx - add eax,ebx - ror ebx,7 - movdqa xmm0,xmm6 - xor esi,edx - pslldq xmm2,12 - paddd xmm6,xmm6 - mov ebp,eax - add edi,DWORD [40+esp] - psrld xmm0,31 - xor ebx,ecx - rol eax,5 - movdqa xmm1,xmm2 - add edi,esi - and ebp,ebx - xor ebx,ecx - psrld xmm2,30 - add edi,eax - ror eax,7 - por xmm6,xmm0 - xor ebp,ecx - movdqa xmm0,[64+esp] - mov esi,edi - add edx,DWORD [44+esp] - pslld xmm1,2 - xor eax,ebx - rol edi,5 - pxor xmm6,xmm2 - movdqa xmm2,[112+esp] - add edx,ebp - and esi,eax - pxor xmm6,xmm1 - pshufd xmm7,xmm3,238 - xor eax,ebx - add edx,edi - ror edi,7 - xor esi,ebx - mov ebp,edx - punpcklqdq xmm7,xmm4 - movdqa xmm1,xmm6 - add ecx,DWORD [48+esp] - xor edi,eax - paddd xmm2,xmm6 - movdqa [64+esp],xmm3 - rol edx,5 - add ecx,esi - psrldq xmm1,4 - and ebp,edi - xor edi,eax - pxor xmm7,xmm3 - add ecx,edx - ror edx,7 - pxor xmm1,xmm5 - xor ebp,eax - mov esi,ecx - add ebx,DWORD [52+esp] - pxor xmm7,xmm1 - xor edx,edi - rol ecx,5 - movdqa [32+esp],xmm2 - add ebx,ebp - and esi,edx - movdqa xmm3,xmm7 - xor edx,edi - add ebx,ecx - ror ecx,7 - movdqa xmm1,xmm7 - xor esi,edi - pslldq xmm3,12 - paddd xmm7,xmm7 - mov ebp,ebx - add eax,DWORD [56+esp] - psrld xmm1,31 - xor ecx,edx - rol ebx,5 - movdqa xmm2,xmm3 - add eax,esi - and ebp,ecx - xor ecx,edx - psrld xmm3,30 - add eax,ebx - ror ebx,7 - por xmm7,xmm1 - xor ebp,edx - movdqa xmm1,[80+esp] - mov esi,eax - add edi,DWORD [60+esp] - pslld xmm2,2 - xor ebx,ecx - rol eax,5 - pxor xmm7,xmm3 - movdqa xmm3,[112+esp] - add edi,ebp - and esi,ebx - pxor xmm7,xmm2 - pshufd xmm2,xmm6,238 - xor ebx,ecx - add edi,eax - ror eax,7 - pxor xmm0,xmm4 - punpcklqdq xmm2,xmm7 - xor esi,ecx - mov ebp,edi - add edx,DWORD [esp] - pxor xmm0,xmm1 - movdqa [80+esp],xmm4 - xor eax,ebx - rol edi,5 - movdqa xmm4,xmm3 - add edx,esi - paddd xmm3,xmm7 - and ebp,eax - pxor xmm0,xmm2 - xor eax,ebx - add edx,edi - ror edi,7 - xor ebp,ebx - movdqa xmm2,xmm0 - movdqa [48+esp],xmm3 - mov esi,edx - add ecx,DWORD [4+esp] - xor edi,eax - rol edx,5 - pslld xmm0,2 - add ecx,ebp - and esi,edi - psrld xmm2,30 - xor edi,eax - add ecx,edx - ror edx,7 - xor esi,eax - mov ebp,ecx - add ebx,DWORD [8+esp] - xor edx,edi - rol ecx,5 - por xmm0,xmm2 - add ebx,esi - and ebp,edx - movdqa xmm2,[96+esp] - xor edx,edi - add ebx,ecx - add eax,DWORD [12+esp] - xor ebp,edi - mov esi,ebx - pshufd xmm3,xmm7,238 - rol ebx,5 - add eax,ebp - xor esi,edx - ror ecx,7 - add eax,ebx - add edi,DWORD [16+esp] - pxor xmm1,xmm5 - punpcklqdq xmm3,xmm0 - xor esi,ecx - mov ebp,eax - rol eax,5 - pxor xmm1,xmm2 - movdqa [96+esp],xmm5 - add edi,esi - xor ebp,ecx - movdqa xmm5,xmm4 - ror ebx,7 - paddd xmm4,xmm0 - add edi,eax - pxor xmm1,xmm3 - add edx,DWORD [20+esp] - xor ebp,ebx - mov esi,edi - rol edi,5 - movdqa xmm3,xmm1 - movdqa [esp],xmm4 - add edx,ebp - xor esi,ebx - ror eax,7 - add edx,edi - pslld xmm1,2 - add ecx,DWORD [24+esp] - xor esi,eax - psrld xmm3,30 - mov ebp,edx - rol edx,5 - add ecx,esi - xor ebp,eax - ror edi,7 - add ecx,edx - por xmm1,xmm3 - add ebx,DWORD [28+esp] - xor ebp,edi - movdqa xmm3,[64+esp] - mov esi,ecx - rol ecx,5 - add ebx,ebp - xor esi,edi - ror edx,7 - pshufd xmm4,xmm0,238 - add ebx,ecx - add eax,DWORD [32+esp] - pxor xmm2,xmm6 - punpcklqdq xmm4,xmm1 - xor esi,edx - mov ebp,ebx - rol ebx,5 - pxor xmm2,xmm3 - movdqa [64+esp],xmm6 - add eax,esi - xor ebp,edx - movdqa xmm6,[128+esp] - ror ecx,7 - paddd xmm5,xmm1 - add eax,ebx - pxor xmm2,xmm4 - add edi,DWORD [36+esp] - xor ebp,ecx - mov esi,eax - rol eax,5 - movdqa xmm4,xmm2 - movdqa [16+esp],xmm5 
- add edi,ebp - xor esi,ecx - ror ebx,7 - add edi,eax - pslld xmm2,2 - add edx,DWORD [40+esp] - xor esi,ebx - psrld xmm4,30 - mov ebp,edi - rol edi,5 - add edx,esi - xor ebp,ebx - ror eax,7 - add edx,edi - por xmm2,xmm4 - add ecx,DWORD [44+esp] - xor ebp,eax - movdqa xmm4,[80+esp] - mov esi,edx - rol edx,5 - add ecx,ebp - xor esi,eax - ror edi,7 - pshufd xmm5,xmm1,238 - add ecx,edx - add ebx,DWORD [48+esp] - pxor xmm3,xmm7 - punpcklqdq xmm5,xmm2 - xor esi,edi - mov ebp,ecx - rol ecx,5 - pxor xmm3,xmm4 - movdqa [80+esp],xmm7 - add ebx,esi - xor ebp,edi - movdqa xmm7,xmm6 - ror edx,7 - paddd xmm6,xmm2 - add ebx,ecx - pxor xmm3,xmm5 - add eax,DWORD [52+esp] - xor ebp,edx - mov esi,ebx - rol ebx,5 - movdqa xmm5,xmm3 - movdqa [32+esp],xmm6 - add eax,ebp - xor esi,edx - ror ecx,7 - add eax,ebx - pslld xmm3,2 - add edi,DWORD [56+esp] - xor esi,ecx - psrld xmm5,30 - mov ebp,eax - rol eax,5 - add edi,esi - xor ebp,ecx - ror ebx,7 - add edi,eax - por xmm3,xmm5 - add edx,DWORD [60+esp] - xor ebp,ebx - movdqa xmm5,[96+esp] - mov esi,edi - rol edi,5 - add edx,ebp - xor esi,ebx - ror eax,7 - pshufd xmm6,xmm2,238 - add edx,edi - add ecx,DWORD [esp] - pxor xmm4,xmm0 - punpcklqdq xmm6,xmm3 - xor esi,eax - mov ebp,edx - rol edx,5 - pxor xmm4,xmm5 - movdqa [96+esp],xmm0 - add ecx,esi - xor ebp,eax - movdqa xmm0,xmm7 - ror edi,7 - paddd xmm7,xmm3 - add ecx,edx - pxor xmm4,xmm6 - add ebx,DWORD [4+esp] - xor ebp,edi - mov esi,ecx - rol ecx,5 - movdqa xmm6,xmm4 - movdqa [48+esp],xmm7 - add ebx,ebp - xor esi,edi - ror edx,7 - add ebx,ecx - pslld xmm4,2 - add eax,DWORD [8+esp] - xor esi,edx - psrld xmm6,30 - mov ebp,ebx - rol ebx,5 - add eax,esi - xor ebp,edx - ror ecx,7 - add eax,ebx - por xmm4,xmm6 - add edi,DWORD [12+esp] - xor ebp,ecx - movdqa xmm6,[64+esp] - mov esi,eax - rol eax,5 - add edi,ebp - xor esi,ecx - ror ebx,7 - pshufd xmm7,xmm3,238 - add edi,eax - add edx,DWORD [16+esp] - pxor xmm5,xmm1 - punpcklqdq xmm7,xmm4 - xor esi,ebx - mov ebp,edi - rol edi,5 - pxor xmm5,xmm6 - movdqa [64+esp],xmm1 - add edx,esi - xor ebp,ebx - movdqa xmm1,xmm0 - ror eax,7 - paddd xmm0,xmm4 - add edx,edi - pxor xmm5,xmm7 - add ecx,DWORD [20+esp] - xor ebp,eax - mov esi,edx - rol edx,5 - movdqa xmm7,xmm5 - movdqa [esp],xmm0 - add ecx,ebp - xor esi,eax - ror edi,7 - add ecx,edx - pslld xmm5,2 - add ebx,DWORD [24+esp] - xor esi,edi - psrld xmm7,30 - mov ebp,ecx - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 - add ebx,ecx - por xmm5,xmm7 - add eax,DWORD [28+esp] - movdqa xmm7,[80+esp] - ror ecx,7 - mov esi,ebx - xor ebp,edx - rol ebx,5 - pshufd xmm0,xmm4,238 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [32+esp] - pxor xmm6,xmm2 - punpcklqdq xmm0,xmm5 - and esi,ecx - xor ecx,edx - ror ebx,7 - pxor xmm6,xmm7 - movdqa [80+esp],xmm2 - mov ebp,eax - xor esi,ecx - rol eax,5 - movdqa xmm2,xmm1 - add edi,esi - paddd xmm1,xmm5 - xor ebp,ebx - pxor xmm6,xmm0 - xor ebx,ecx - add edi,eax - add edx,DWORD [36+esp] - and ebp,ebx - movdqa xmm0,xmm6 - movdqa [16+esp],xmm1 - xor ebx,ecx - ror eax,7 - mov esi,edi - xor ebp,ebx - rol edi,5 - pslld xmm6,2 - add edx,ebp - xor esi,eax - psrld xmm0,30 - xor eax,ebx - add edx,edi - add ecx,DWORD [40+esp] - and esi,eax - xor eax,ebx - ror edi,7 - por xmm6,xmm0 - mov ebp,edx - xor esi,eax - movdqa xmm0,[96+esp] - rol edx,5 - add ecx,esi - xor ebp,edi - xor edi,eax - add ecx,edx - pshufd xmm1,xmm5,238 - add ebx,DWORD [44+esp] - and ebp,edi - xor edi,eax - ror edx,7 - mov esi,ecx - xor ebp,edi - rol ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - add eax,DWORD 
[48+esp] - pxor xmm7,xmm3 - punpcklqdq xmm1,xmm6 - and esi,edx - xor edx,edi - ror ecx,7 - pxor xmm7,xmm0 - movdqa [96+esp],xmm3 - mov ebp,ebx - xor esi,edx - rol ebx,5 - movdqa xmm3,[144+esp] - add eax,esi - paddd xmm2,xmm6 - xor ebp,ecx - pxor xmm7,xmm1 - xor ecx,edx - add eax,ebx - add edi,DWORD [52+esp] - and ebp,ecx - movdqa xmm1,xmm7 - movdqa [32+esp],xmm2 - xor ecx,edx - ror ebx,7 - mov esi,eax - xor ebp,ecx - rol eax,5 - pslld xmm7,2 - add edi,ebp - xor esi,ebx - psrld xmm1,30 - xor ebx,ecx - add edi,eax - add edx,DWORD [56+esp] - and esi,ebx - xor ebx,ecx - ror eax,7 - por xmm7,xmm1 - mov ebp,edi - xor esi,ebx - movdqa xmm1,[64+esp] - rol edi,5 - add edx,esi - xor ebp,eax - xor eax,ebx - add edx,edi - pshufd xmm2,xmm6,238 - add ecx,DWORD [60+esp] - and ebp,eax - xor eax,ebx - ror edi,7 - mov esi,edx - xor ebp,eax - rol edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [esp] - pxor xmm0,xmm4 - punpcklqdq xmm2,xmm7 - and esi,edi - xor edi,eax - ror edx,7 - pxor xmm0,xmm1 - movdqa [64+esp],xmm4 - mov ebp,ecx - xor esi,edi - rol ecx,5 - movdqa xmm4,xmm3 - add ebx,esi - paddd xmm3,xmm7 - xor ebp,edx - pxor xmm0,xmm2 - xor edx,edi - add ebx,ecx - add eax,DWORD [4+esp] - and ebp,edx - movdqa xmm2,xmm0 - movdqa [48+esp],xmm3 - xor edx,edi - ror ecx,7 - mov esi,ebx - xor ebp,edx - rol ebx,5 - pslld xmm0,2 - add eax,ebp - xor esi,ecx - psrld xmm2,30 - xor ecx,edx - add eax,ebx - add edi,DWORD [8+esp] - and esi,ecx - xor ecx,edx - ror ebx,7 - por xmm0,xmm2 - mov ebp,eax - xor esi,ecx - movdqa xmm2,[80+esp] - rol eax,5 - add edi,esi - xor ebp,ebx - xor ebx,ecx - add edi,eax - pshufd xmm3,xmm7,238 - add edx,DWORD [12+esp] - and ebp,ebx - xor ebx,ecx - ror eax,7 - mov esi,edi - xor ebp,ebx - rol edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [16+esp] - pxor xmm1,xmm5 - punpcklqdq xmm3,xmm0 - and esi,eax - xor eax,ebx - ror edi,7 - pxor xmm1,xmm2 - movdqa [80+esp],xmm5 - mov ebp,edx - xor esi,eax - rol edx,5 - movdqa xmm5,xmm4 - add ecx,esi - paddd xmm4,xmm0 - xor ebp,edi - pxor xmm1,xmm3 - xor edi,eax - add ecx,edx - add ebx,DWORD [20+esp] - and ebp,edi - movdqa xmm3,xmm1 - movdqa [esp],xmm4 - xor edi,eax - ror edx,7 - mov esi,ecx - xor ebp,edi - rol ecx,5 - pslld xmm1,2 - add ebx,ebp - xor esi,edx - psrld xmm3,30 - xor edx,edi - add ebx,ecx - add eax,DWORD [24+esp] - and esi,edx - xor edx,edi - ror ecx,7 - por xmm1,xmm3 - mov ebp,ebx - xor esi,edx - movdqa xmm3,[96+esp] - rol ebx,5 - add eax,esi - xor ebp,ecx - xor ecx,edx - add eax,ebx - pshufd xmm4,xmm0,238 - add edi,DWORD [28+esp] - and ebp,ecx - xor ecx,edx - ror ebx,7 - mov esi,eax - xor ebp,ecx - rol eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [32+esp] - pxor xmm2,xmm6 - punpcklqdq xmm4,xmm1 - and esi,ebx - xor ebx,ecx - ror eax,7 - pxor xmm2,xmm3 - movdqa [96+esp],xmm6 - mov ebp,edi - xor esi,ebx - rol edi,5 - movdqa xmm6,xmm5 - add edx,esi - paddd xmm5,xmm1 - xor ebp,eax - pxor xmm2,xmm4 - xor eax,ebx - add edx,edi - add ecx,DWORD [36+esp] - and ebp,eax - movdqa xmm4,xmm2 - movdqa [16+esp],xmm5 - xor eax,ebx - ror edi,7 - mov esi,edx - xor ebp,eax - rol edx,5 - pslld xmm2,2 - add ecx,ebp - xor esi,edi - psrld xmm4,30 - xor edi,eax - add ecx,edx - add ebx,DWORD [40+esp] - and esi,edi - xor edi,eax - ror edx,7 - por xmm2,xmm4 - mov ebp,ecx - xor esi,edi - movdqa xmm4,[64+esp] - rol ecx,5 - add ebx,esi - xor ebp,edx - xor edx,edi - add ebx,ecx - pshufd xmm5,xmm1,238 - add eax,DWORD [44+esp] - and ebp,edx - xor edx,edi - ror ecx,7 - mov esi,ebx - 
xor ebp,edx - rol ebx,5 - add eax,ebp - xor esi,edx - add eax,ebx - add edi,DWORD [48+esp] - pxor xmm3,xmm7 - punpcklqdq xmm5,xmm2 - xor esi,ecx - mov ebp,eax - rol eax,5 - pxor xmm3,xmm4 - movdqa [64+esp],xmm7 - add edi,esi - xor ebp,ecx - movdqa xmm7,xmm6 - ror ebx,7 - paddd xmm6,xmm2 - add edi,eax - pxor xmm3,xmm5 - add edx,DWORD [52+esp] - xor ebp,ebx - mov esi,edi - rol edi,5 - movdqa xmm5,xmm3 - movdqa [32+esp],xmm6 - add edx,ebp - xor esi,ebx - ror eax,7 - add edx,edi - pslld xmm3,2 - add ecx,DWORD [56+esp] - xor esi,eax - psrld xmm5,30 - mov ebp,edx - rol edx,5 - add ecx,esi - xor ebp,eax - ror edi,7 - add ecx,edx - por xmm3,xmm5 - add ebx,DWORD [60+esp] - xor ebp,edi - mov esi,ecx - rol ecx,5 - add ebx,ebp - xor esi,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [esp] - xor esi,edx - mov ebp,ebx - rol ebx,5 - add eax,esi - xor ebp,edx - ror ecx,7 - paddd xmm7,xmm3 - add eax,ebx - add edi,DWORD [4+esp] - xor ebp,ecx - mov esi,eax - movdqa [48+esp],xmm7 - rol eax,5 - add edi,ebp - xor esi,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [8+esp] - xor esi,ebx - mov ebp,edi - rol edi,5 - add edx,esi - xor ebp,ebx - ror eax,7 - add edx,edi - add ecx,DWORD [12+esp] - xor ebp,eax - mov esi,edx - rol edx,5 - add ecx,ebp - xor esi,eax - ror edi,7 - add ecx,edx - mov ebp,DWORD [196+esp] - cmp ebp,DWORD [200+esp] - je NEAR L$003done - movdqa xmm7,[160+esp] - movdqa xmm6,[176+esp] - movdqu xmm0,[ebp] - movdqu xmm1,[16+ebp] - movdqu xmm2,[32+ebp] - movdqu xmm3,[48+ebp] - add ebp,64 -db 102,15,56,0,198 - mov DWORD [196+esp],ebp - movdqa [96+esp],xmm7 - add ebx,DWORD [16+esp] - xor esi,edi - mov ebp,ecx - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 -db 102,15,56,0,206 - add ebx,ecx - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - paddd xmm0,xmm7 - rol ebx,5 - add eax,ebp - xor esi,edx - ror ecx,7 - movdqa [esp],xmm0 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - psubd xmm0,xmm7 - rol eax,5 - add edi,esi - xor ebp,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - rol edi,5 - add edx,ebp - xor esi,ebx - ror eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - mov ebp,edx - rol edx,5 - add ecx,esi - xor ebp,eax - ror edi,7 -db 102,15,56,0,214 - add ecx,edx - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - paddd xmm1,xmm7 - rol ecx,5 - add ebx,ebp - xor esi,edi - ror edx,7 - movdqa [16+esp],xmm1 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - psubd xmm1,xmm7 - rol ebx,5 - add eax,esi - xor ebp,edx - ror ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - rol eax,5 - add edi,ebp - xor esi,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - mov ebp,edi - rol edi,5 - add edx,esi - xor ebp,ebx - ror eax,7 -db 102,15,56,0,222 - add edx,edi - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - paddd xmm2,xmm7 - rol edx,5 - add ecx,ebp - xor esi,eax - ror edi,7 - movdqa [32+esp],xmm2 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - psubd xmm2,xmm7 - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - rol ebx,5 - add eax,ebp - ror ecx,7 - add eax,ebx - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov DWORD [8+ebp],ecx - mov ebx,ecx - mov DWORD [12+ebp],edx - xor ebx,edx - mov DWORD [16+ebp],edi - mov ebp,esi - pshufd xmm4,xmm0,238 - and 
esi,ebx - mov ebx,ebp - jmp NEAR L$002loop -align 16 -L$003done: - add ebx,DWORD [16+esp] - xor esi,edi - mov ebp,ecx - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - rol ebx,5 - add eax,ebp - xor esi,edx - ror ecx,7 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - rol eax,5 - add edi,esi - xor ebp,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - rol edi,5 - add edx,ebp - xor esi,ebx - ror eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - mov ebp,edx - rol edx,5 - add ecx,esi - xor ebp,eax - ror edi,7 - add ecx,edx - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - rol ecx,5 - add ebx,ebp - xor esi,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - rol ebx,5 - add eax,esi - xor ebp,edx - ror ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - rol eax,5 - add edi,ebp - xor esi,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - mov ebp,edi - rol edi,5 - add edx,esi - xor ebp,ebx - ror eax,7 - add edx,edi - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - rol edx,5 - add ecx,ebp - xor esi,eax - ror edi,7 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - rol ebx,5 - add eax,ebp - ror ecx,7 - add eax,ebx - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - mov esp,DWORD [204+esp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov DWORD [8+ebp],ecx - mov DWORD [12+ebp],edx - mov DWORD [16+ebp],edi - pop edi - pop esi - pop ebx - pop ebp - ret -global _sha1_block_data_order_avx -align 16 -_sha1_block_data_order_avx: -L$_sha1_block_data_order_avx_begin: - push ebp - push ebx - push esi - push edi - call L$004pic_point -L$004pic_point: - pop ebp - lea ebp,[(L$K_XX_XX-L$004pic_point)+ebp] - vzeroall - vmovdqa xmm7,[ebp] - vmovdqa xmm0,[16+ebp] - vmovdqa xmm1,[32+ebp] - vmovdqa xmm2,[48+ebp] - vmovdqa xmm6,[64+ebp] - mov edi,DWORD [20+esp] - mov ebp,DWORD [24+esp] - mov edx,DWORD [28+esp] - mov esi,esp - sub esp,208 - and esp,-64 - vmovdqa [112+esp],xmm0 - vmovdqa [128+esp],xmm1 - vmovdqa [144+esp],xmm2 - shl edx,6 - vmovdqa [160+esp],xmm7 - add edx,ebp - vmovdqa [176+esp],xmm6 - add ebp,64 - mov DWORD [192+esp],edi - mov DWORD [196+esp],ebp - mov DWORD [200+esp],edx - mov DWORD [204+esp],esi - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - mov edi,DWORD [16+edi] - mov esi,ebx - vmovdqu xmm0,[ebp-64] - vmovdqu xmm1,[ebp-48] - vmovdqu xmm2,[ebp-32] - vmovdqu xmm3,[ebp-16] - vpshufb xmm0,xmm0,xmm6 - vpshufb xmm1,xmm1,xmm6 - vpshufb xmm2,xmm2,xmm6 - vmovdqa [96+esp],xmm7 - vpshufb xmm3,xmm3,xmm6 - vpaddd xmm4,xmm0,xmm7 - vpaddd xmm5,xmm1,xmm7 - vpaddd xmm6,xmm2,xmm7 - vmovdqa [esp],xmm4 - mov ebp,ecx - vmovdqa [16+esp],xmm5 - xor ebp,edx - vmovdqa [32+esp],xmm6 - and esi,ebp - jmp NEAR L$005loop -align 16 -L$005loop: - shrd ebx,ebx,2 - xor esi,edx - vpalignr xmm4,xmm1,xmm0,8 - mov ebp,eax - add edi,DWORD [esp] - vpaddd xmm7,xmm7,xmm3 - vmovdqa [64+esp],xmm0 - xor ebx,ecx - shld eax,eax,5 - vpsrldq xmm6,xmm3,4 - add edi,esi - and ebp,ebx - vpxor xmm4,xmm4,xmm0 - xor ebx,ecx - add edi,eax - vpxor xmm6,xmm6,xmm2 - shrd eax,eax,7 - xor ebp,ecx - vmovdqa [48+esp],xmm7 - mov esi,edi - add edx,DWORD [4+esp] - 
vpxor xmm4,xmm4,xmm6 - xor eax,ebx - shld edi,edi,5 - add edx,ebp - and esi,eax - vpsrld xmm6,xmm4,31 - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor esi,ebx - vpslldq xmm0,xmm4,12 - vpaddd xmm4,xmm4,xmm4 - mov ebp,edx - add ecx,DWORD [8+esp] - xor edi,eax - shld edx,edx,5 - vpsrld xmm7,xmm0,30 - vpor xmm4,xmm4,xmm6 - add ecx,esi - and ebp,edi - xor edi,eax - add ecx,edx - vpslld xmm0,xmm0,2 - shrd edx,edx,7 - xor ebp,eax - vpxor xmm4,xmm4,xmm7 - mov esi,ecx - add ebx,DWORD [12+esp] - xor edx,edi - shld ecx,ecx,5 - vpxor xmm4,xmm4,xmm0 - add ebx,ebp - and esi,edx - vmovdqa xmm0,[96+esp] - xor edx,edi - add ebx,ecx - shrd ecx,ecx,7 - xor esi,edi - vpalignr xmm5,xmm2,xmm1,8 - mov ebp,ebx - add eax,DWORD [16+esp] - vpaddd xmm0,xmm0,xmm4 - vmovdqa [80+esp],xmm1 - xor ecx,edx - shld ebx,ebx,5 - vpsrldq xmm7,xmm4,4 - add eax,esi - and ebp,ecx - vpxor xmm5,xmm5,xmm1 - xor ecx,edx - add eax,ebx - vpxor xmm7,xmm7,xmm3 - shrd ebx,ebx,7 - xor ebp,edx - vmovdqa [esp],xmm0 - mov esi,eax - add edi,DWORD [20+esp] - vpxor xmm5,xmm5,xmm7 - xor ebx,ecx - shld eax,eax,5 - add edi,ebp - and esi,ebx - vpsrld xmm7,xmm5,31 - xor ebx,ecx - add edi,eax - shrd eax,eax,7 - xor esi,ecx - vpslldq xmm1,xmm5,12 - vpaddd xmm5,xmm5,xmm5 - mov ebp,edi - add edx,DWORD [24+esp] - xor eax,ebx - shld edi,edi,5 - vpsrld xmm0,xmm1,30 - vpor xmm5,xmm5,xmm7 - add edx,esi - and ebp,eax - xor eax,ebx - add edx,edi - vpslld xmm1,xmm1,2 - shrd edi,edi,7 - xor ebp,ebx - vpxor xmm5,xmm5,xmm0 - mov esi,edx - add ecx,DWORD [28+esp] - xor edi,eax - shld edx,edx,5 - vpxor xmm5,xmm5,xmm1 - add ecx,ebp - and esi,edi - vmovdqa xmm1,[112+esp] - xor edi,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - vpalignr xmm6,xmm3,xmm2,8 - mov ebp,ecx - add ebx,DWORD [32+esp] - vpaddd xmm1,xmm1,xmm5 - vmovdqa [96+esp],xmm2 - xor edx,edi - shld ecx,ecx,5 - vpsrldq xmm0,xmm5,4 - add ebx,esi - and ebp,edx - vpxor xmm6,xmm6,xmm2 - xor edx,edi - add ebx,ecx - vpxor xmm0,xmm0,xmm4 - shrd ecx,ecx,7 - xor ebp,edi - vmovdqa [16+esp],xmm1 - mov esi,ebx - add eax,DWORD [36+esp] - vpxor xmm6,xmm6,xmm0 - xor ecx,edx - shld ebx,ebx,5 - add eax,ebp - and esi,ecx - vpsrld xmm0,xmm6,31 - xor ecx,edx - add eax,ebx - shrd ebx,ebx,7 - xor esi,edx - vpslldq xmm2,xmm6,12 - vpaddd xmm6,xmm6,xmm6 - mov ebp,eax - add edi,DWORD [40+esp] - xor ebx,ecx - shld eax,eax,5 - vpsrld xmm1,xmm2,30 - vpor xmm6,xmm6,xmm0 - add edi,esi - and ebp,ebx - xor ebx,ecx - add edi,eax - vpslld xmm2,xmm2,2 - vmovdqa xmm0,[64+esp] - shrd eax,eax,7 - xor ebp,ecx - vpxor xmm6,xmm6,xmm1 - mov esi,edi - add edx,DWORD [44+esp] - xor eax,ebx - shld edi,edi,5 - vpxor xmm6,xmm6,xmm2 - add edx,ebp - and esi,eax - vmovdqa xmm2,[112+esp] - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor esi,ebx - vpalignr xmm7,xmm4,xmm3,8 - mov ebp,edx - add ecx,DWORD [48+esp] - vpaddd xmm2,xmm2,xmm6 - vmovdqa [64+esp],xmm3 - xor edi,eax - shld edx,edx,5 - vpsrldq xmm1,xmm6,4 - add ecx,esi - and ebp,edi - vpxor xmm7,xmm7,xmm3 - xor edi,eax - add ecx,edx - vpxor xmm1,xmm1,xmm5 - shrd edx,edx,7 - xor ebp,eax - vmovdqa [32+esp],xmm2 - mov esi,ecx - add ebx,DWORD [52+esp] - vpxor xmm7,xmm7,xmm1 - xor edx,edi - shld ecx,ecx,5 - add ebx,ebp - and esi,edx - vpsrld xmm1,xmm7,31 - xor edx,edi - add ebx,ecx - shrd ecx,ecx,7 - xor esi,edi - vpslldq xmm3,xmm7,12 - vpaddd xmm7,xmm7,xmm7 - mov ebp,ebx - add eax,DWORD [56+esp] - xor ecx,edx - shld ebx,ebx,5 - vpsrld xmm2,xmm3,30 - vpor xmm7,xmm7,xmm1 - add eax,esi - and ebp,ecx - xor ecx,edx - add eax,ebx - vpslld xmm3,xmm3,2 - vmovdqa xmm1,[80+esp] - shrd ebx,ebx,7 - xor ebp,edx - vpxor 
xmm7,xmm7,xmm2 - mov esi,eax - add edi,DWORD [60+esp] - xor ebx,ecx - shld eax,eax,5 - vpxor xmm7,xmm7,xmm3 - add edi,ebp - and esi,ebx - vmovdqa xmm3,[112+esp] - xor ebx,ecx - add edi,eax - vpalignr xmm2,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - shrd eax,eax,7 - xor esi,ecx - mov ebp,edi - add edx,DWORD [esp] - vpxor xmm0,xmm0,xmm1 - vmovdqa [80+esp],xmm4 - xor eax,ebx - shld edi,edi,5 - vmovdqa xmm4,xmm3 - vpaddd xmm3,xmm3,xmm7 - add edx,esi - and ebp,eax - vpxor xmm0,xmm0,xmm2 - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor ebp,ebx - vpsrld xmm2,xmm0,30 - vmovdqa [48+esp],xmm3 - mov esi,edx - add ecx,DWORD [4+esp] - xor edi,eax - shld edx,edx,5 - vpslld xmm0,xmm0,2 - add ecx,ebp - and esi,edi - xor edi,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - mov ebp,ecx - add ebx,DWORD [8+esp] - vpor xmm0,xmm0,xmm2 - xor edx,edi - shld ecx,ecx,5 - vmovdqa xmm2,[96+esp] - add ebx,esi - and ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [12+esp] - xor ebp,edi - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpalignr xmm3,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add edi,DWORD [16+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - vpxor xmm1,xmm1,xmm2 - vmovdqa [96+esp],xmm5 - add edi,esi - xor ebp,ecx - vmovdqa xmm5,xmm4 - vpaddd xmm4,xmm4,xmm0 - shrd ebx,ebx,7 - add edi,eax - vpxor xmm1,xmm1,xmm3 - add edx,DWORD [20+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - vpsrld xmm3,xmm1,30 - vmovdqa [esp],xmm4 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpslld xmm1,xmm1,2 - add ecx,DWORD [24+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vpor xmm1,xmm1,xmm3 - add ebx,DWORD [28+esp] - xor ebp,edi - vmovdqa xmm3,[64+esp] - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - vpalignr xmm4,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add eax,DWORD [32+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - vpxor xmm2,xmm2,xmm3 - vmovdqa [64+esp],xmm6 - add eax,esi - xor ebp,edx - vmovdqa xmm6,[128+esp] - vpaddd xmm5,xmm5,xmm1 - shrd ecx,ecx,7 - add eax,ebx - vpxor xmm2,xmm2,xmm4 - add edi,DWORD [36+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - vpsrld xmm4,xmm2,30 - vmovdqa [16+esp],xmm5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - vpslld xmm2,xmm2,2 - add edx,DWORD [40+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - vpor xmm2,xmm2,xmm4 - add ecx,DWORD [44+esp] - xor ebp,eax - vmovdqa xmm4,[80+esp] - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - vpalignr xmm5,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add ebx,DWORD [48+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - vpxor xmm3,xmm3,xmm4 - vmovdqa [80+esp],xmm7 - add ebx,esi - xor ebp,edi - vmovdqa xmm7,xmm6 - vpaddd xmm6,xmm6,xmm2 - shrd edx,edx,7 - add ebx,ecx - vpxor xmm3,xmm3,xmm5 - add eax,DWORD [52+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - vpsrld xmm5,xmm3,30 - vmovdqa [32+esp],xmm6 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpslld xmm3,xmm3,2 - add edi,DWORD [56+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - vpor xmm3,xmm3,xmm5 - add edx,DWORD [60+esp] - xor ebp,ebx - vmovdqa xmm5,[96+esp] - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpalignr xmm6,xmm3,xmm2,8 - vpxor xmm4,xmm4,xmm0 - add ecx,DWORD [esp] - xor esi,eax - mov ebp,edx - shld 
edx,edx,5 - vpxor xmm4,xmm4,xmm5 - vmovdqa [96+esp],xmm0 - add ecx,esi - xor ebp,eax - vmovdqa xmm0,xmm7 - vpaddd xmm7,xmm7,xmm3 - shrd edi,edi,7 - add ecx,edx - vpxor xmm4,xmm4,xmm6 - add ebx,DWORD [4+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - vpsrld xmm6,xmm4,30 - vmovdqa [48+esp],xmm7 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - vpslld xmm4,xmm4,2 - add eax,DWORD [8+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - vpor xmm4,xmm4,xmm6 - add edi,DWORD [12+esp] - xor ebp,ecx - vmovdqa xmm6,[64+esp] - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - vpalignr xmm7,xmm4,xmm3,8 - vpxor xmm5,xmm5,xmm1 - add edx,DWORD [16+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - vpxor xmm5,xmm5,xmm6 - vmovdqa [64+esp],xmm1 - add edx,esi - xor ebp,ebx - vmovdqa xmm1,xmm0 - vpaddd xmm0,xmm0,xmm4 - shrd eax,eax,7 - add edx,edi - vpxor xmm5,xmm5,xmm7 - add ecx,DWORD [20+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - vpsrld xmm7,xmm5,30 - vmovdqa [esp],xmm0 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - vpslld xmm5,xmm5,2 - add ebx,DWORD [24+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - vpor xmm5,xmm5,xmm7 - add eax,DWORD [28+esp] - vmovdqa xmm7,[80+esp] - shrd ecx,ecx,7 - mov esi,ebx - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - vpalignr xmm0,xmm5,xmm4,8 - vpxor xmm6,xmm6,xmm2 - add edi,DWORD [32+esp] - and esi,ecx - xor ecx,edx - shrd ebx,ebx,7 - vpxor xmm6,xmm6,xmm7 - vmovdqa [80+esp],xmm2 - mov ebp,eax - xor esi,ecx - vmovdqa xmm2,xmm1 - vpaddd xmm1,xmm1,xmm5 - shld eax,eax,5 - add edi,esi - vpxor xmm6,xmm6,xmm0 - xor ebp,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [36+esp] - vpsrld xmm0,xmm6,30 - vmovdqa [16+esp],xmm1 - and ebp,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,edi - vpslld xmm6,xmm6,2 - xor ebp,ebx - shld edi,edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [40+esp] - and esi,eax - vpor xmm6,xmm6,xmm0 - xor eax,ebx - shrd edi,edi,7 - vmovdqa xmm0,[96+esp] - mov ebp,edx - xor esi,eax - shld edx,edx,5 - add ecx,esi - xor ebp,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [44+esp] - and ebp,edi - xor edi,eax - shrd edx,edx,7 - mov esi,ecx - xor ebp,edi - shld ecx,ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - vpalignr xmm1,xmm6,xmm5,8 - vpxor xmm7,xmm7,xmm3 - add eax,DWORD [48+esp] - and esi,edx - xor edx,edi - shrd ecx,ecx,7 - vpxor xmm7,xmm7,xmm0 - vmovdqa [96+esp],xmm3 - mov ebp,ebx - xor esi,edx - vmovdqa xmm3,[144+esp] - vpaddd xmm2,xmm2,xmm6 - shld ebx,ebx,5 - add eax,esi - vpxor xmm7,xmm7,xmm1 - xor ebp,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [52+esp] - vpsrld xmm1,xmm7,30 - vmovdqa [32+esp],xmm2 - and ebp,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - vpslld xmm7,xmm7,2 - xor ebp,ecx - shld eax,eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [56+esp] - and esi,ebx - vpor xmm7,xmm7,xmm1 - xor ebx,ecx - shrd eax,eax,7 - vmovdqa xmm1,[64+esp] - mov ebp,edi - xor esi,ebx - shld edi,edi,5 - add edx,esi - xor ebp,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [60+esp] - and ebp,eax - xor eax,ebx - shrd edi,edi,7 - mov esi,edx - xor ebp,eax - shld edx,edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - vpalignr xmm2,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - add ebx,DWORD [esp] - and esi,edi - xor edi,eax - shrd edx,edx,7 - vpxor 
xmm0,xmm0,xmm1 - vmovdqa [64+esp],xmm4 - mov ebp,ecx - xor esi,edi - vmovdqa xmm4,xmm3 - vpaddd xmm3,xmm3,xmm7 - shld ecx,ecx,5 - add ebx,esi - vpxor xmm0,xmm0,xmm2 - xor ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [4+esp] - vpsrld xmm2,xmm0,30 - vmovdqa [48+esp],xmm3 - and ebp,edx - xor edx,edi - shrd ecx,ecx,7 - mov esi,ebx - vpslld xmm0,xmm0,2 - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [8+esp] - and esi,ecx - vpor xmm0,xmm0,xmm2 - xor ecx,edx - shrd ebx,ebx,7 - vmovdqa xmm2,[80+esp] - mov ebp,eax - xor esi,ecx - shld eax,eax,5 - add edi,esi - xor ebp,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [12+esp] - and ebp,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,edi - xor ebp,ebx - shld edi,edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - vpalignr xmm3,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add ecx,DWORD [16+esp] - and esi,eax - xor eax,ebx - shrd edi,edi,7 - vpxor xmm1,xmm1,xmm2 - vmovdqa [80+esp],xmm5 - mov ebp,edx - xor esi,eax - vmovdqa xmm5,xmm4 - vpaddd xmm4,xmm4,xmm0 - shld edx,edx,5 - add ecx,esi - vpxor xmm1,xmm1,xmm3 - xor ebp,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [20+esp] - vpsrld xmm3,xmm1,30 - vmovdqa [esp],xmm4 - and ebp,edi - xor edi,eax - shrd edx,edx,7 - mov esi,ecx - vpslld xmm1,xmm1,2 - xor ebp,edi - shld ecx,ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [24+esp] - and esi,edx - vpor xmm1,xmm1,xmm3 - xor edx,edi - shrd ecx,ecx,7 - vmovdqa xmm3,[96+esp] - mov ebp,ebx - xor esi,edx - shld ebx,ebx,5 - add eax,esi - xor ebp,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [28+esp] - and ebp,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - xor ebp,ecx - shld eax,eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - vpalignr xmm4,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add edx,DWORD [32+esp] - and esi,ebx - xor ebx,ecx - shrd eax,eax,7 - vpxor xmm2,xmm2,xmm3 - vmovdqa [96+esp],xmm6 - mov ebp,edi - xor esi,ebx - vmovdqa xmm6,xmm5 - vpaddd xmm5,xmm5,xmm1 - shld edi,edi,5 - add edx,esi - vpxor xmm2,xmm2,xmm4 - xor ebp,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [36+esp] - vpsrld xmm4,xmm2,30 - vmovdqa [16+esp],xmm5 - and ebp,eax - xor eax,ebx - shrd edi,edi,7 - mov esi,edx - vpslld xmm2,xmm2,2 - xor ebp,eax - shld edx,edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [40+esp] - and esi,edi - vpor xmm2,xmm2,xmm4 - xor edi,eax - shrd edx,edx,7 - vmovdqa xmm4,[64+esp] - mov ebp,ecx - xor esi,edi - shld ecx,ecx,5 - add ebx,esi - xor ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [44+esp] - and ebp,edx - xor edx,edi - shrd ecx,ecx,7 - mov esi,ebx - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - add eax,ebx - vpalignr xmm5,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add edi,DWORD [48+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - vpxor xmm3,xmm3,xmm4 - vmovdqa [64+esp],xmm7 - add edi,esi - xor ebp,ecx - vmovdqa xmm7,xmm6 - vpaddd xmm6,xmm6,xmm2 - shrd ebx,ebx,7 - add edi,eax - vpxor xmm3,xmm3,xmm5 - add edx,DWORD [52+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - vpsrld xmm5,xmm3,30 - vmovdqa [32+esp],xmm6 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpslld xmm3,xmm3,2 - add ecx,DWORD [56+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vpor xmm3,xmm3,xmm5 - add ebx,DWORD [60+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [esp] - vpaddd xmm7,xmm7,xmm3 
- xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - vmovdqa [48+esp],xmm7 - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [4+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [8+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [12+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - mov ebp,DWORD [196+esp] - cmp ebp,DWORD [200+esp] - je NEAR L$006done - vmovdqa xmm7,[160+esp] - vmovdqa xmm6,[176+esp] - vmovdqu xmm0,[ebp] - vmovdqu xmm1,[16+ebp] - vmovdqu xmm2,[32+ebp] - vmovdqu xmm3,[48+ebp] - add ebp,64 - vpshufb xmm0,xmm0,xmm6 - mov DWORD [196+esp],ebp - vmovdqa [96+esp],xmm7 - add ebx,DWORD [16+esp] - xor esi,edi - vpshufb xmm1,xmm1,xmm6 - mov ebp,ecx - shld ecx,ecx,5 - vpaddd xmm4,xmm0,xmm7 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - vmovdqa [esp],xmm4 - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - vpshufb xmm2,xmm2,xmm6 - mov ebp,edx - shld edx,edx,5 - vpaddd xmm5,xmm1,xmm7 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vmovdqa [16+esp],xmm5 - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - vpshufb xmm3,xmm3,xmm6 - mov ebp,edi - shld edi,edi,5 - vpaddd xmm6,xmm2,xmm7 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - vmovdqa [32+esp],xmm6 - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - shrd ecx,ecx,7 - add eax,ebx - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov ebx,ecx - mov DWORD [8+ebp],ecx - xor ebx,edx - mov DWORD [12+ebp],edx - mov DWORD [16+ebp],edi - mov ebp,esi - and esi,ebx - mov ebx,ebp - jmp NEAR L$005loop -align 16 -L$006done: - add ebx,DWORD [16+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor 
ebp,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - shrd ecx,ecx,7 - add eax,ebx - vzeroall - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - mov esp,DWORD [204+esp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov DWORD [8+ebp],ecx - mov DWORD [12+ebp],edx - mov DWORD [16+ebp],edi - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$K_XX_XX: -dd 1518500249,1518500249,1518500249,1518500249 -dd 1859775393,1859775393,1859775393,1859775393 -dd 2400959708,2400959708,2400959708,2400959708 -dd 3395469782,3395469782,3395469782,3395469782 -dd 66051,67438087,134810123,202182159 -db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -db 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 -db 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 -db 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 -db 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-ios.ios.arm.S deleted file mode 100644 index 4d1a2e21..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-ios.ios.arm.S +++ /dev/null @@ -1,1494 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl _sha1_block_data_order_nohw -.private_extern _sha1_block_data_order_nohw -#ifdef __thumb2__ -.thumb_func _sha1_block_data_order_nohw -#endif - -.align 5 -_sha1_block_data_order_nohw: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - ldmia r0,{r3,r4,r5,r6,r7} -Lloop: - ldr r8,LK_00_19 - mov r14,sp - sub sp,sp,#15*4 - mov r5,r5,ror#30 - mov r6,r6,ror#30 - mov r7,r7,ror#30 @ [6] -L_00_15: -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r6,r8,r6,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r4,r5 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r6,r8,r6,ror#2 @ E+=K_00_19 - eor r10,r4,r5 @ F_xx_xx - add r6,r6,r7,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r3,r10,ror#2 - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r6,r6,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r5,r8,r5,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r3,r4 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r5,r8,r5,ror#2 @ E+=K_00_19 - eor r10,r3,r4 @ F_xx_xx - add r5,r5,r6,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r7,r10,ror#2 - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r5,r5,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r4,r8,r4,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r7,r3 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r4,r8,r4,ror#2 @ E+=K_00_19 - eor r10,r7,r3 @ F_xx_xx - add r4,r4,r5,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r6,r10,ror#2 - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r4,r4,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r3,r8,r3,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r6,r7 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r3,r8,r3,ror#2 @ E+=K_00_19 - eor r10,r6,r7 @ F_xx_xx - add r3,r3,r4,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r5,r10,ror#2 - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r3,r3,r10 @ E+=F_00_19(B,C,D) -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp -#endif - bne L_00_15 @ [((11+4)*5+2)*3] - sub sp,sp,#25*4 -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - add r6,r6,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - add r5,r5,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - add r4,r4,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - add r3,r3,r10 @ E+=F_00_19(B,C,D) - - ldr r8,LK_20_39 @ [+15+16*4] - cmn sp,#0 @ [+3], clear carry to denote 20_39 -L_20_39_or_60_79: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- eor r10,r4,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_20_39(B,C,D) -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp @ preserve carry -#endif - bne L_20_39_or_60_79 @ [+((12+3)*5+2)*4] - bcs L_done @ [+((12+3)*5+2)*4], spare 300 bytes - - ldr r8,LK_40_59 - sub sp,sp,#20*4 @ [+2] -L_40_59: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r4,r10,ror#2 @ F_xx_xx - and r11,r5,r6 @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_40_59(B,C,D) - add r7,r7,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - and r11,r4,r5 @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_40_59(B,C,D) - add r6,r6,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - and r11,r3,r4 @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_40_59(B,C,D) - add r5,r5,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r6,r10,ror#2 @ F_xx_xx - and r11,r7,r3 @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_40_59(B,C,D) - add r4,r4,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - and r11,r6,r7 @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_40_59(B,C,D) - add r3,r3,r11,ror#2 -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp -#endif - bne L_40_59 @ [+((12+5)*5+2)*4] - - ldr r8,LK_60_79 - sub sp,sp,#20*4 - cmp sp,#0 @ set carry to denote 60_79 - b L_20_39_or_60_79 @ [+4], spare 300 bytes -L_done: - add sp,sp,#80*4 @ "deallocate" stack frame - ldmia r0,{r8,r9,r10,r11,r12} - add r3,r8,r3 - add r4,r9,r4 - add r5,r10,r5,ror#2 - add r6,r11,r6,ror#2 - add r7,r12,r7,ror#2 - stmia r0,{r3,r4,r5,r6,r7} - teq r1,r2 - bne Lloop @ [+18], total 1307 - -#if __ARM_ARCH>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - - -.align 5 -LK_00_19:.word 0x5a827999 -LK_20_39:.word 0x6ed9eba1 -LK_40_59:.word 0x8f1bbcdc -LK_60_79:.word 0xca62c1d6 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 5 -#if __ARM_MAX_ARCH__>=7 - - - -.globl _sha1_block_data_order_neon -.private_extern _sha1_block_data_order_neon -#ifdef __thumb2__ -.thumb_func _sha1_block_data_order_neon -#endif -.align 4 -_sha1_block_data_order_neon: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - @ dmb @ errata #451034 on early Cortex A8 - @ vstmdb sp!,{d8-d15} @ ABI specification says so - mov r14,sp - sub r12,sp,#64 - adr r8,LK_00_19 - bic r12,r12,#15 @ align for 128-bit stores - - ldmia r0,{r3,r4,r5,r6,r7} @ load context - mov sp,r12 @ alloca - - vld1.8 {q0,q1},[r1]! @ handles unaligned - veor q15,q15,q15 - vld1.8 {q2,q3},[r1]! - vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 - vrev32.8 q0,q0 @ yes, even on - vrev32.8 q1,q1 @ big-endian... - vrev32.8 q2,q2 - vadd.i32 q8,q0,q14 - vrev32.8 q3,q3 - vadd.i32 q9,q1,q14 - vst1.32 {q8},[r12,:128]! - vadd.i32 q10,q2,q14 - vst1.32 {q9},[r12,:128]! - vst1.32 {q10},[r12,:128]! - ldr r9,[sp] @ big RAW stall - -Loop_neon: - vext.8 q8,q0,q1,#8 - bic r10,r6,r4 - add r7,r7,r9 - and r11,r5,r4 - vadd.i32 q13,q3,q14 - ldr r9,[sp,#4] - add r7,r7,r3,ror#27 - vext.8 q12,q3,q15,#4 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q8,q8,q0 - bic r10,r5,r3 - add r6,r6,r9 - veor q12,q12,q2 - and r11,r4,r3 - ldr r9,[sp,#8] - veor q12,q12,q8 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q13,q15,q12,#4 - bic r10,r4,r7 - add r5,r5,r9 - vadd.i32 q8,q12,q12 - and r11,r3,r7 - ldr r9,[sp,#12] - vsri.32 q8,q12,#31 - add r5,r5,r6,ror#27 - eor r11,r11,r10 - mov r7,r7,ror#2 - vshr.u32 q12,q13,#30 - add r5,r5,r11 - bic r10,r3,r6 - vshl.u32 q13,q13,#2 - add r4,r4,r9 - and r11,r7,r6 - veor q8,q8,q12 - ldr r9,[sp,#16] - add r4,r4,r5,ror#27 - veor q8,q8,q13 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q9,q1,q2,#8 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - vadd.i32 q13,q8,q14 - ldr r9,[sp,#20] - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r4,ror#27 - vext.8 q12,q8,q15,#4 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - veor q9,q9,q1 - bic r10,r6,r4 - add r7,r7,r9 - veor q12,q12,q3 - and r11,r5,r4 - ldr r9,[sp,#24] - veor q12,q12,q9 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q13,q15,q12,#4 - bic r10,r5,r3 - add r6,r6,r9 - vadd.i32 q9,q12,q12 - and r11,r4,r3 - ldr r9,[sp,#28] - vsri.32 q9,q12,#31 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - mov r3,r3,ror#2 - vshr.u32 q12,q13,#30 - add r6,r6,r11 - bic r10,r4,r7 - vshl.u32 q13,q13,#2 - add r5,r5,r9 - and r11,r3,r7 - veor q9,q9,q12 - ldr r9,[sp,#32] - add r5,r5,r6,ror#27 - veor q9,q9,q13 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q10,q2,q3,#8 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - vadd.i32 q13,q9,q14 - ldr r9,[sp,#36] - add r4,r4,r5,ror#27 - vext.8 q12,q9,q15,#4 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q10,q10,q2 - bic r10,r7,r5 - add r3,r3,r9 - veor q12,q12,q8 - and r11,r6,r5 - ldr r9,[sp,#40] - veor q12,q12,q10 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q13,q15,q12,#4 - bic r10,r6,r4 - add r7,r7,r9 - vadd.i32 q10,q12,q12 - and r11,r5,r4 - ldr r9,[sp,#44] - vsri.32 q10,q12,#31 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - mov r4,r4,ror#2 - vshr.u32 q12,q13,#30 - add r7,r7,r11 - bic r10,r5,r3 - vshl.u32 q13,q13,#2 - add r6,r6,r9 - and r11,r4,r3 - veor q10,q10,q12 - ldr r9,[sp,#48] - add r6,r6,r7,ror#27 - veor q10,q10,q13 - eor r11,r11,r10 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q11,q3,q8,#8 - bic r10,r4,r7 - add r5,r5,r9 - and r11,r3,r7 - vadd.i32 q13,q10,q14 - ldr r9,[sp,#52] - add r5,r5,r6,ror#27 - vext.8 q12,q10,q15,#4 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q11,q11,q3 - bic r10,r3,r6 - add r4,r4,r9 - veor q12,q12,q9 - and r11,r7,r6 - ldr r9,[sp,#56] - veor q12,q12,q11 - add r4,r4,r5,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q13,q15,q12,#4 - bic r10,r7,r5 - add r3,r3,r9 - vadd.i32 q11,q12,q12 - and r11,r6,r5 - ldr r9,[sp,#60] - vsri.32 q11,q12,#31 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - vshr.u32 q12,q13,#30 - add r3,r3,r11 - bic r10,r6,r4 - vshl.u32 q13,q13,#2 - add r7,r7,r9 - and r11,r5,r4 - veor q11,q11,q12 - ldr r9,[sp,#0] - add r7,r7,r3,ror#27 - veor q11,q11,q13 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q10,q11,#8 - bic r10,r5,r3 - add r6,r6,r9 - and r11,r4,r3 - veor q0,q0,q8 - ldr r9,[sp,#4] - add r6,r6,r7,ror#27 - veor q0,q0,q1 - eor r11,r11,r10 - mov r3,r3,ror#2 - vadd.i32 q13,q11,q14 - add r6,r6,r11 - bic r10,r4,r7 - veor q12,q12,q0 - add r5,r5,r9 - and r11,r3,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - eor r11,r11,r10 - mov r7,r7,ror#2 - vsli.32 q0,q12,#2 - add r5,r5,r11 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - ldr r9,[sp,#12] - add r4,r4,r5,ror#27 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - ldr r9,[sp,#16] - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q11,q0,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#20] - veor q1,q1,q9 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q1,q1,q2 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q0,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q1 - ldr r9,[sp,#24] - eor r11,r10,r4 - vshr.u32 q1,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q1,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#28] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#32] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q0,q1,#8 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#36] - veor q2,q2,q10 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - veor q2,q2,q3 - mov r5,r5,ror#2 - add r3,r3,r11 - vadd.i32 q13,q1,q14 - eor r10,r4,r6 - vld1.32 {d28[],d29[]},[r8,:32]! - add r7,r7,r9 - veor q12,q12,q2 - ldr r9,[sp,#40] - eor r11,r10,r5 - vshr.u32 q2,q12,#30 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - vst1.32 {q13},[r12,:128]! - add r7,r7,r11 - eor r10,r3,r5 - vsli.32 q2,q12,#2 - add r6,r6,r9 - ldr r9,[sp,#44] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#48] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q1,q2,#8 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r7 - add r4,r4,r5,ror#27 - veor q3,q3,q8 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q2,q14 - eor r10,r5,r7 - add r3,r3,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r6 - vshr.u32 q3,q12,#30 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vst1.32 {q13},[r12,:128]! - add r3,r3,r11 - eor r10,r4,r6 - vsli.32 q3,q12,#2 - add r7,r7,r9 - ldr r9,[sp,#60] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#0] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q2,q3,#8 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#4] - veor q8,q8,q0 - eor r11,r10,r3 - add r5,r5,r6,ror#27 - veor q8,q8,q9 - mov r7,r7,ror#2 - add r5,r5,r11 - vadd.i32 q13,q3,q14 - eor r10,r6,r3 - add r4,r4,r9 - veor q12,q12,q8 - ldr r9,[sp,#8] - eor r11,r10,r7 - vshr.u32 q8,q12,#30 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - add r4,r4,r11 - eor r10,r5,r7 - vsli.32 q8,q12,#2 - add r3,r3,r9 - ldr r9,[sp,#12] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#16] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q3,q8,#8 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#20] - veor q9,q9,q1 - eor r11,r10,r4 - add r6,r6,r7,ror#27 - veor q9,q9,q10 - mov r3,r3,ror#2 - add r6,r6,r11 - vadd.i32 q13,q8,q14 - eor r10,r7,r4 - add r5,r5,r9 - veor q12,q12,q9 - ldr r9,[sp,#24] - eor r11,r10,r3 - vshr.u32 q9,q12,#30 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - vst1.32 {q13},[r12,:128]! 
- add r5,r5,r11 - eor r10,r6,r3 - vsli.32 q9,q12,#2 - add r4,r4,r9 - ldr r9,[sp,#28] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#32] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q8,q9,#8 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#36] - veor q10,q10,q2 - add r7,r7,r3,ror#27 - eor r11,r5,r6 - veor q10,q10,q11 - add r7,r7,r10 - and r11,r11,r4 - vadd.i32 q13,q9,q14 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q12,q12,q10 - add r6,r6,r9 - and r10,r4,r5 - vshr.u32 q10,q12,#30 - ldr r9,[sp,#40] - add r6,r6,r7,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r4,r5 - add r6,r6,r10 - vsli.32 q10,q12,#2 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#44] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#48] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q9,q10,#8 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#52] - veor q11,q11,q3 - add r3,r3,r4,ror#27 - eor r11,r6,r7 - veor q11,q11,q0 - add r3,r3,r10 - and r11,r11,r5 - vadd.i32 q13,q10,q14 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - veor q12,q12,q11 - add r7,r7,r9 - and r10,r5,r6 - vshr.u32 q11,q12,#30 - ldr r9,[sp,#56] - add r7,r7,r3,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r5,r6 - add r7,r7,r10 - vsli.32 q11,q12,#2 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#60] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#0] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q10,q11,#8 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#4] - veor q0,q0,q8 - add r4,r4,r5,ror#27 - eor r11,r7,r3 - veor q0,q0,q1 - add r4,r4,r10 - and r11,r11,r6 - vadd.i32 q13,q11,q14 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q12,q12,q0 - add r3,r3,r9 - and r10,r6,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r3,r3,r4,ror#27 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - eor r11,r6,r7 - add r3,r3,r10 - vsli.32 q0,q12,#2 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#12] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#16] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q11,q0,#8 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#20] - veor q1,q1,q9 - add r5,r5,r6,ror#27 - eor r11,r3,r4 - veor q1,q1,q2 - add r5,r5,r10 - and r11,r11,r7 - vadd.i32 q13,q0,q14 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q12,q12,q1 - add r4,r4,r9 - and r10,r7,r3 - vshr.u32 q1,q12,#30 - ldr r9,[sp,#24] - add r4,r4,r5,ror#27 - vst1.32 {q13},[r12,:128]! 
- eor r11,r7,r3 - add r4,r4,r10 - vsli.32 q1,q12,#2 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#28] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#32] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q0,q1,#8 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#36] - veor q2,q2,q10 - add r6,r6,r7,ror#27 - eor r11,r4,r5 - veor q2,q2,q3 - add r6,r6,r10 - and r11,r11,r3 - vadd.i32 q13,q1,q14 - mov r3,r3,ror#2 - add r6,r6,r11 - veor q12,q12,q2 - add r5,r5,r9 - and r10,r3,r4 - vshr.u32 q2,q12,#30 - ldr r9,[sp,#40] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r3,r4 - add r5,r5,r10 - vsli.32 q2,q12,#2 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#44] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#48] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q1,q2,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q3,q3,q8 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q2,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r4 - vshr.u32 q3,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q3,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#60] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#0] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q3,q14 - eor r10,r5,r7 - add r3,r3,r9 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - teq r1,r2 - sub r8,r8,#16 - it eq - subeq r1,r1,#64 - vld1.8 {q0,q1},[r1]! - ldr r9,[sp,#4] - eor r11,r10,r6 - vld1.8 {q2,q3},[r1]! - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - eor r10,r4,r6 - vrev32.8 q0,q0 - add r7,r7,r9 - ldr r9,[sp,#8] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#12] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#16] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vrev32.8 q1,q1 - eor r10,r6,r3 - add r4,r4,r9 - vadd.i32 q8,q0,q14 - ldr r9,[sp,#20] - eor r11,r10,r7 - vst1.32 {q8},[r12,:128]! - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#24] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#28] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#32] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vrev32.8 q2,q2 - eor r10,r7,r4 - add r5,r5,r9 - vadd.i32 q9,q1,q14 - ldr r9,[sp,#36] - eor r11,r10,r3 - vst1.32 {q9},[r12,:128]! 
- add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#40] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#44] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#48] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vrev32.8 q3,q3 - eor r10,r3,r5 - add r6,r6,r9 - vadd.i32 q10,q2,q14 - ldr r9,[sp,#52] - eor r11,r10,r4 - vst1.32 {q10},[r12,:128]! - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#56] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#60] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - ldmia r0,{r9,r10,r11,r12} @ accumulate context - add r3,r3,r9 - ldr r9,[r0,#16] - add r4,r4,r10 - add r5,r5,r11 - add r6,r6,r12 - it eq - moveq sp,r14 - add r7,r7,r9 - it ne - ldrne r9,[sp] - stmia r0,{r3,r4,r5,r6,r7} - itt ne - addne r12,sp,#3*16 - bne Loop_neon - - @ vldmia sp!,{d8-d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} - -#endif -#if __ARM_MAX_ARCH__>=7 - -# if defined(__thumb2__) -# define INST(a,b,c,d) .byte c,d|0xf,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d|0x10 -# endif - -.globl _sha1_block_data_order_hw -.private_extern _sha1_block_data_order_hw -#ifdef __thumb2__ -.thumb_func _sha1_block_data_order_hw -#endif -.align 5 -_sha1_block_data_order_hw: - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - - veor q1,q1,q1 - adr r3,LK_00_19 - vld1.32 {q0},[r0]! - vld1.32 {d2[0]},[r0] - sub r0,r0,#16 - vld1.32 {d16[],d17[]},[r3,:32]! - vld1.32 {d18[],d19[]},[r3,:32]! - vld1.32 {d20[],d21[]},[r3,:32]! - vld1.32 {d22[],d23[]},[r3,:32] - -Loop_v8: - vld1.8 {q4,q5},[r1]! - vld1.8 {q6,q7},[r1]! 
- vrev32.8 q4,q4 - vrev32.8 q5,q5 - - vadd.i32 q12,q8,q4 - vrev32.8 q6,q6 - vmov q14,q0 @ offload - subs r2,r2,#1 - - vadd.i32 q13,q8,q5 - vrev32.8 q7,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 - INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 - vadd.i32 q12,q8,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 - INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 - vadd.i32 q13,q8,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 - INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 - vadd.i32 q12,q8,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 - INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 - vadd.i32 q13,q9,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 - INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 - vadd.i32 q12,q9,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q9,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q10,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q10,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 - INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 - vadd.i32 q13,q10,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 - INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 - vadd.i32 q13,q11,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q11,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q11,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - 
INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q7 - - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - - vadd.i32 q1,q1,q2 - vadd.i32 q0,q0,q14 - bne Loop_v8 - - vst1.32 {q0},[r0]! - vst1.32 {d2[0]},[r0] - - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - bx lr @ bx lr - -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-windows.windows.x86.S deleted file mode 100644 index d00f36c7..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-windows.windows.x86.S +++ /dev/null @@ -1,5608 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. - -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _sha256_block_data_order_nohw -align 16 -_sha256_block_data_order_nohw: -L$_sha256_block_data_order_nohw_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov ebx,esp - call L$000pic_point -L$000pic_point: - pop ebp - lea ebp,[(L$K256-L$000pic_point)+ebp] - sub esp,16 - and esp,-64 - shl eax,6 - add eax,edi - mov DWORD [esp],esi - mov DWORD [4+esp],edi - mov DWORD [8+esp],eax - mov DWORD [12+esp],ebx -L$001no_xmm: - sub eax,edi - cmp eax,256 - jae NEAR L$002unrolled - jmp NEAR L$003loop -align 16 -L$003loop: - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - bswap eax - mov edx,DWORD [12+edi] - bswap ebx - push eax - bswap ecx - push ebx - bswap edx - push ecx - push edx - mov eax,DWORD [16+edi] - mov ebx,DWORD [20+edi] - mov ecx,DWORD [24+edi] - bswap eax - mov edx,DWORD [28+edi] - bswap ebx - push eax - bswap ecx - push ebx - bswap edx - push ecx - push edx - mov eax,DWORD [32+edi] - mov ebx,DWORD [36+edi] - mov ecx,DWORD [40+edi] - bswap eax - mov edx,DWORD [44+edi] - bswap ebx - push eax - bswap ecx - push ebx - bswap edx - push ecx - push edx - mov eax,DWORD [48+edi] - mov ebx,DWORD [52+edi] - mov ecx,DWORD [56+edi] - bswap eax - mov edx,DWORD [60+edi] - bswap ebx - push eax - bswap ecx - push ebx - bswap edx - push ecx - push edx - add edi,64 - lea esp,[esp-36] - mov DWORD [104+esp],edi - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edi,DWORD [12+esi] - mov DWORD [8+esp],ebx - xor ebx,ecx - mov DWORD [12+esp],ecx - mov DWORD [16+esp],edi - mov DWORD [esp],ebx - mov edx,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov edi,DWORD [28+esi] - mov DWORD [24+esp],ebx - mov DWORD [28+esp],ecx - mov DWORD [32+esp],edi -align 16 -L$00400_15: - mov ecx,edx - mov esi,DWORD [24+esp] - ror ecx,14 - mov edi,DWORD [28+esp] - xor ecx,edx - xor esi,edi - mov ebx,DWORD [96+esp] - ror ecx,5 - and esi,edx - mov DWORD [20+esp],edx - xor edx,ecx - add 
ebx,DWORD [32+esp] - xor esi,edi - ror edx,6 - mov ecx,eax - add ebx,esi - ror ecx,9 - add ebx,edx - mov edi,DWORD [8+esp] - xor ecx,eax - mov DWORD [4+esp],eax - lea esp,[esp-4] - ror ecx,11 - mov esi,DWORD [ebp] - xor ecx,eax - mov edx,DWORD [20+esp] - xor eax,edi - ror ecx,2 - add ebx,esi - mov DWORD [esp],eax - add edx,ebx - and eax,DWORD [4+esp] - add ebx,ecx - xor eax,edi - add ebp,4 - add eax,ebx - cmp esi,3248222580 - jne NEAR L$00400_15 - mov ecx,DWORD [156+esp] - jmp NEAR L$00516_63 -align 16 -L$00516_63: - mov ebx,ecx - mov esi,DWORD [104+esp] - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [160+esp] - shr edi,10 - add ebx,DWORD [124+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [24+esp] - ror ecx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor ecx,edx - xor esi,edi - mov DWORD [96+esp],ebx - ror ecx,5 - and esi,edx - mov DWORD [20+esp],edx - xor edx,ecx - add ebx,DWORD [32+esp] - xor esi,edi - ror edx,6 - mov ecx,eax - add ebx,esi - ror ecx,9 - add ebx,edx - mov edi,DWORD [8+esp] - xor ecx,eax - mov DWORD [4+esp],eax - lea esp,[esp-4] - ror ecx,11 - mov esi,DWORD [ebp] - xor ecx,eax - mov edx,DWORD [20+esp] - xor eax,edi - ror ecx,2 - add ebx,esi - mov DWORD [esp],eax - add edx,ebx - and eax,DWORD [4+esp] - add ebx,ecx - xor eax,edi - mov ecx,DWORD [156+esp] - add ebp,4 - add eax,ebx - cmp esi,3329325298 - jne NEAR L$00516_63 - mov esi,DWORD [356+esp] - mov ebx,DWORD [8+esp] - mov ecx,DWORD [16+esp] - add eax,DWORD [esi] - add ebx,DWORD [4+esi] - add edi,DWORD [8+esi] - add ecx,DWORD [12+esi] - mov DWORD [esi],eax - mov DWORD [4+esi],ebx - mov DWORD [8+esi],edi - mov DWORD [12+esi],ecx - mov eax,DWORD [24+esp] - mov ebx,DWORD [28+esp] - mov ecx,DWORD [32+esp] - mov edi,DWORD [360+esp] - add edx,DWORD [16+esi] - add eax,DWORD [20+esi] - add ebx,DWORD [24+esi] - add ecx,DWORD [28+esi] - mov DWORD [16+esi],edx - mov DWORD [20+esi],eax - mov DWORD [24+esi],ebx - mov DWORD [28+esi],ecx - lea esp,[356+esp] - sub ebp,256 - cmp edi,DWORD [8+esp] - jb NEAR L$003loop - mov esp,DWORD [12+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$K256: -dd 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 -dd 66051,67438087,134810123,202182159 -db 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -db 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -db 62,0 -align 16 -L$002unrolled: - lea esp,[esp-96] - mov eax,DWORD [esi] - mov ebp,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov ebx,DWORD [12+esi] - mov DWORD [4+esp],ebp - xor ebp,ecx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],ebx - mov edx,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov esi,DWORD [28+esi] - mov DWORD [20+esp],ebx - mov DWORD [24+esp],ecx - mov DWORD 
[28+esp],esi - jmp NEAR L$006grand_loop -align 16 -L$006grand_loop: - mov ebx,DWORD [edi] - mov ecx,DWORD [4+edi] - bswap ebx - mov esi,DWORD [8+edi] - bswap ecx - mov DWORD [32+esp],ebx - bswap esi - mov DWORD [36+esp],ecx - mov DWORD [40+esp],esi - mov ebx,DWORD [12+edi] - mov ecx,DWORD [16+edi] - bswap ebx - mov esi,DWORD [20+edi] - bswap ecx - mov DWORD [44+esp],ebx - bswap esi - mov DWORD [48+esp],ecx - mov DWORD [52+esp],esi - mov ebx,DWORD [24+edi] - mov ecx,DWORD [28+edi] - bswap ebx - mov esi,DWORD [32+edi] - bswap ecx - mov DWORD [56+esp],ebx - bswap esi - mov DWORD [60+esp],ecx - mov DWORD [64+esp],esi - mov ebx,DWORD [36+edi] - mov ecx,DWORD [40+edi] - bswap ebx - mov esi,DWORD [44+edi] - bswap ecx - mov DWORD [68+esp],ebx - bswap esi - mov DWORD [72+esp],ecx - mov DWORD [76+esp],esi - mov ebx,DWORD [48+edi] - mov ecx,DWORD [52+edi] - bswap ebx - mov esi,DWORD [56+edi] - bswap ecx - mov DWORD [80+esp],ebx - bswap esi - mov DWORD [84+esp],ecx - mov DWORD [88+esp],esi - mov ebx,DWORD [60+edi] - add edi,64 - bswap ebx - mov DWORD [100+esp],edi - mov DWORD [92+esp],ebx - mov ecx,edx - mov esi,DWORD [20+esp] - ror edx,14 - mov edi,DWORD [24+esp] - xor edx,ecx - mov ebx,DWORD [32+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1116352408+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [16+esp] - ror edx,14 - mov edi,DWORD [20+esp] - xor edx,esi - mov ebx,DWORD [36+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1899447441+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [12+esp] - ror edx,14 - mov edi,DWORD [16+esp] - xor edx,ecx - mov ebx,DWORD [40+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3049323471+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [8+esp] - ror edx,14 - mov edi,DWORD [12+esp] - xor edx,esi - mov ebx,DWORD [44+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3921009573+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [4+esp] - ror edx,14 - mov edi,DWORD [8+esp] - xor edx,ecx - mov ebx,DWORD [48+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD 
[16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[961987163+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [esp] - ror edx,14 - mov edi,DWORD [4+esp] - xor edx,esi - mov ebx,DWORD [52+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1508970993+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [28+esp] - ror edx,14 - mov edi,DWORD [esp] - xor edx,ecx - mov ebx,DWORD [56+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2453635748+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [24+esp] - ror edx,14 - mov edi,DWORD [28+esp] - xor edx,esi - mov ebx,DWORD [60+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2870763221+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [20+esp] - ror edx,14 - mov edi,DWORD [24+esp] - xor edx,ecx - mov ebx,DWORD [64+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3624381080+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [16+esp] - ror edx,14 - mov edi,DWORD [20+esp] - xor edx,esi - mov ebx,DWORD [68+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[310598401+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [12+esp] - ror edx,14 - mov edi,DWORD [16+esp] - xor edx,ecx - mov ebx,DWORD [72+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[607225278+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [8+esp] - ror edx,14 - mov edi,DWORD [12+esp] - xor edx,esi - mov ebx,DWORD [76+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD 
[16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1426881987+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [4+esp] - ror edx,14 - mov edi,DWORD [8+esp] - xor edx,ecx - mov ebx,DWORD [80+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1925078388+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [esp] - ror edx,14 - mov edi,DWORD [4+esp] - xor edx,esi - mov ebx,DWORD [84+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2162078206+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [28+esp] - ror edx,14 - mov edi,DWORD [esp] - xor edx,ecx - mov ebx,DWORD [88+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2614888103+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [24+esp] - ror edx,14 - mov edi,DWORD [28+esp] - xor edx,esi - mov ebx,DWORD [92+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3248222580+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [36+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [88+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [32+esp] - shr edi,10 - add ebx,DWORD [68+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [32+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3835390401+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [40+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [92+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [36+esp] - shr edi,10 - add ebx,DWORD [72+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov 
edi,DWORD [20+esp] - xor edx,esi - mov DWORD [36+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[4022224774+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [44+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [32+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [40+esp] - shr edi,10 - add ebx,DWORD [76+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [40+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[264347078+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [48+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [36+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [44+esp] - shr edi,10 - add ebx,DWORD [80+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [44+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[604807628+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [52+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [40+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [48+esp] - shr edi,10 - add ebx,DWORD [84+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [48+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[770255983+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [56+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [44+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [52+esp] - shr edi,10 - add ebx,DWORD [88+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [52+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea 
edx,[1249150122+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [60+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [48+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [56+esp] - shr edi,10 - add ebx,DWORD [92+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [56+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1555081692+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [64+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [52+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [60+esp] - shr edi,10 - add ebx,DWORD [32+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [60+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1996064986+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [68+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [56+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [64+esp] - shr edi,10 - add ebx,DWORD [36+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [64+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2554220882+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [72+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [60+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [68+esp] - shr edi,10 - add ebx,DWORD [40+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [68+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2821834349+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [76+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [64+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [72+esp] - shr edi,10 - add ebx,DWORD 
[44+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [72+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2952996808+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [80+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [68+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [76+esp] - shr edi,10 - add ebx,DWORD [48+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [76+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3210313671+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [84+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [72+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [80+esp] - shr edi,10 - add ebx,DWORD [52+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [80+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3336571891+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [88+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [76+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [84+esp] - shr edi,10 - add ebx,DWORD [56+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [84+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3584528711+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [92+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [80+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [88+esp] - shr edi,10 - add ebx,DWORD [60+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [88+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD 
[12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[113926993+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [32+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [84+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [92+esp] - shr edi,10 - add ebx,DWORD [64+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [92+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[338241895+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [36+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [88+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [32+esp] - shr edi,10 - add ebx,DWORD [68+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [32+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[666307205+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [40+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [92+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [36+esp] - shr edi,10 - add ebx,DWORD [72+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [36+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[773529912+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [44+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [32+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [40+esp] - shr edi,10 - add ebx,DWORD [76+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [40+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1294757372+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [48+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [36+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - 
xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [44+esp] - shr edi,10 - add ebx,DWORD [80+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [44+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1396182291+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [52+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [40+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [48+esp] - shr edi,10 - add ebx,DWORD [84+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [48+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1695183700+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [56+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [44+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [52+esp] - shr edi,10 - add ebx,DWORD [88+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [52+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1986661051+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [60+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [48+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [56+esp] - shr edi,10 - add ebx,DWORD [92+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [56+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2177026350+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [64+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [52+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [60+esp] - shr edi,10 - add ebx,DWORD [32+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [60+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor 
edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2456956037+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [68+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [56+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [64+esp] - shr edi,10 - add ebx,DWORD [36+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [64+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2730485921+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [72+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [60+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [68+esp] - shr edi,10 - add ebx,DWORD [40+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [68+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2820302411+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [76+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [64+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [72+esp] - shr edi,10 - add ebx,DWORD [44+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [72+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3259730800+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [80+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [68+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [76+esp] - shr edi,10 - add ebx,DWORD [48+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [76+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3345764771+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [84+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [72+esp] 
- mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [80+esp] - shr edi,10 - add ebx,DWORD [52+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [80+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3516065817+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [88+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [76+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [84+esp] - shr edi,10 - add ebx,DWORD [56+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [84+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3600352804+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [92+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [80+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [88+esp] - shr edi,10 - add ebx,DWORD [60+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [88+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[4094571909+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [32+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [84+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [92+esp] - shr edi,10 - add ebx,DWORD [64+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [92+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[275423344+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [36+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [88+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [32+esp] - shr edi,10 - add ebx,DWORD [68+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [32+esp],ebx - xor esi,edi - ror 
edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[430227734+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [40+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [92+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [36+esp] - shr edi,10 - add ebx,DWORD [72+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [36+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[506948616+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [44+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [32+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [40+esp] - shr edi,10 - add ebx,DWORD [76+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [40+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[659060556+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [48+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [36+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [44+esp] - shr edi,10 - add ebx,DWORD [80+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [44+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[883997877+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [52+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [40+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [48+esp] - shr edi,10 - add ebx,DWORD [84+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [48+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[958139571+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [56+esp] - ror 
ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [44+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [52+esp] - shr edi,10 - add ebx,DWORD [88+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [52+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1322822218+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [60+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [48+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [56+esp] - shr edi,10 - add ebx,DWORD [92+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [56+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1537002063+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [64+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [52+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [60+esp] - shr edi,10 - add ebx,DWORD [32+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [60+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1747873779+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [68+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [56+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [64+esp] - shr edi,10 - add ebx,DWORD [36+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [64+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1955562222+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [72+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [60+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [68+esp] - shr edi,10 - add ebx,DWORD [40+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add 
ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [68+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2024104815+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [76+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [64+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [72+esp] - shr edi,10 - add ebx,DWORD [44+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [72+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2227730452+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [80+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [68+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [76+esp] - shr edi,10 - add ebx,DWORD [48+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [76+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2361852424+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [84+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [72+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [80+esp] - shr edi,10 - add ebx,DWORD [52+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [80+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2428436474+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [88+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [76+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [84+esp] - shr edi,10 - add ebx,DWORD [56+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [84+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and 
eax,ebp - lea edx,[2756734187+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [92+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [80+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [88+esp] - shr edi,10 - add ebx,DWORD [60+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3204031479+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [32+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [84+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [92+esp] - shr edi,10 - add ebx,DWORD [64+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3329325298+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [96+esp] - xor ebp,edi - mov ecx,DWORD [12+esp] - add eax,DWORD [esi] - add ebp,DWORD [4+esi] - add edi,DWORD [8+esi] - add ecx,DWORD [12+esi] - mov DWORD [esi],eax - mov DWORD [4+esi],ebp - mov DWORD [8+esi],edi - mov DWORD [12+esi],ecx - mov DWORD [4+esp],ebp - xor ebp,edi - mov DWORD [8+esp],edi - mov DWORD [12+esp],ecx - mov edi,DWORD [20+esp] - mov ebx,DWORD [24+esp] - mov ecx,DWORD [28+esp] - add edx,DWORD [16+esi] - add edi,DWORD [20+esi] - add ebx,DWORD [24+esi] - add ecx,DWORD [28+esi] - mov DWORD [16+esi],edx - mov DWORD [20+esi],edi - mov DWORD [24+esi],ebx - mov DWORD [28+esi],ecx - mov DWORD [20+esp],edi - mov edi,DWORD [100+esp] - mov DWORD [24+esp],ebx - mov DWORD [28+esp],ecx - cmp edi,DWORD [104+esp] - jb NEAR L$006grand_loop - mov esp,DWORD [108+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _sha256_block_data_order_ssse3 -align 16 -_sha256_block_data_order_ssse3: -L$_sha256_block_data_order_ssse3_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov ebx,esp - call L$007pic_point -L$007pic_point: - pop ebp - lea ebp,[(L$K256-L$007pic_point)+ebp] - sub esp,16 - and esp,-64 - shl eax,6 - add eax,edi - mov DWORD [esp],esi - mov DWORD [4+esp],edi - mov DWORD [8+esp],eax - mov DWORD [12+esp],ebx - lea esp,[esp-96] - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edi,DWORD [12+esi] - mov DWORD [4+esp],ebx - xor ebx,ecx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],edi - mov edx,DWORD [16+esi] - mov edi,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov esi,DWORD [28+esi] - mov DWORD [20+esp],edi - mov edi,DWORD [100+esp] - mov DWORD [24+esp],ecx - mov DWORD [28+esp],esi - movdqa xmm7,[256+ebp] - jmp NEAR L$008grand_ssse3 -align 16 -L$008grand_ssse3: - movdqu xmm0,[edi] - movdqu xmm1,[16+edi] - movdqu 
xmm2,[32+edi] - movdqu xmm3,[48+edi] - add edi,64 -db 102,15,56,0,199 - mov DWORD [100+esp],edi -db 102,15,56,0,207 - movdqa xmm4,[ebp] -db 102,15,56,0,215 - movdqa xmm5,[16+ebp] - paddd xmm4,xmm0 -db 102,15,56,0,223 - movdqa xmm6,[32+ebp] - paddd xmm5,xmm1 - movdqa xmm7,[48+ebp] - movdqa [32+esp],xmm4 - paddd xmm6,xmm2 - movdqa [48+esp],xmm5 - paddd xmm7,xmm3 - movdqa [64+esp],xmm6 - movdqa [80+esp],xmm7 - jmp NEAR L$009ssse3_00_47 -align 16 -L$009ssse3_00_47: - add ebp,64 - mov ecx,edx - movdqa xmm4,xmm1 - ror edx,14 - mov esi,DWORD [20+esp] - movdqa xmm7,xmm3 - xor edx,ecx - mov edi,DWORD [24+esp] -db 102,15,58,15,224,4 - xor esi,edi - ror edx,5 - and esi,ecx -db 102,15,58,15,250,4 - mov DWORD [16+esp],ecx - xor edx,ecx - xor edi,esi - movdqa xmm5,xmm4 - ror edx,6 - mov ecx,eax - movdqa xmm6,xmm4 - add edx,edi - mov edi,DWORD [4+esp] - psrld xmm4,3 - mov esi,eax - ror ecx,9 - paddd xmm0,xmm7 - mov DWORD [esp],eax - xor ecx,eax - psrld xmm6,7 - xor eax,edi - add edx,DWORD [28+esp] - ror ecx,11 - and ebx,eax - pshufd xmm7,xmm3,250 - xor ecx,esi - add edx,DWORD [32+esp] - pslld xmm5,14 - xor ebx,edi - ror ecx,2 - pxor xmm4,xmm6 - add ebx,edx - add edx,DWORD [12+esp] - psrld xmm6,11 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm4,xmm5 - mov esi,DWORD [16+esp] - xor edx,ecx - pslld xmm5,11 - mov edi,DWORD [20+esp] - xor esi,edi - ror edx,5 - pxor xmm4,xmm6 - and esi,ecx - mov DWORD [12+esp],ecx - movdqa xmm6,xmm7 - xor edx,ecx - xor edi,esi - ror edx,6 - pxor xmm4,xmm5 - mov ecx,ebx - add edx,edi - psrld xmm7,10 - mov edi,DWORD [esp] - mov esi,ebx - ror ecx,9 - paddd xmm0,xmm4 - mov DWORD [28+esp],ebx - xor ecx,ebx - psrlq xmm6,17 - xor ebx,edi - add edx,DWORD [24+esp] - ror ecx,11 - pxor xmm7,xmm6 - and eax,ebx - xor ecx,esi - psrlq xmm6,2 - add edx,DWORD [36+esp] - xor eax,edi - ror ecx,2 - pxor xmm7,xmm6 - add eax,edx - add edx,DWORD [8+esp] - pshufd xmm7,xmm7,128 - add eax,ecx - mov ecx,edx - ror edx,14 - mov esi,DWORD [12+esp] - xor edx,ecx - mov edi,DWORD [16+esp] - xor esi,edi - ror edx,5 - and esi,ecx - psrldq xmm7,8 - mov DWORD [8+esp],ecx - xor edx,ecx - xor edi,esi - paddd xmm0,xmm7 - ror edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [28+esp] - mov esi,eax - ror ecx,9 - mov DWORD [24+esp],eax - pshufd xmm7,xmm0,80 - xor ecx,eax - xor eax,edi - add edx,DWORD [20+esp] - movdqa xmm6,xmm7 - ror ecx,11 - psrld xmm7,10 - and ebx,eax - psrlq xmm6,17 - xor ecx,esi - add edx,DWORD [40+esp] - xor ebx,edi - ror ecx,2 - pxor xmm7,xmm6 - add ebx,edx - add edx,DWORD [4+esp] - psrlq xmm6,2 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm7,xmm6 - mov esi,DWORD [8+esp] - xor edx,ecx - mov edi,DWORD [12+esp] - pshufd xmm7,xmm7,8 - xor esi,edi - ror edx,5 - movdqa xmm6,[ebp] - and esi,ecx - mov DWORD [4+esp],ecx - pslldq xmm7,8 - xor edx,ecx - xor edi,esi - ror edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [24+esp] - mov esi,ebx - ror ecx,9 - paddd xmm0,xmm7 - mov DWORD [20+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [16+esp] - paddd xmm6,xmm0 - ror ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [44+esp] - xor eax,edi - ror ecx,2 - add eax,edx - add edx,DWORD [esp] - add eax,ecx - movdqa [32+esp],xmm6 - mov ecx,edx - movdqa xmm4,xmm2 - ror edx,14 - mov esi,DWORD [4+esp] - movdqa xmm7,xmm0 - xor edx,ecx - mov edi,DWORD [8+esp] -db 102,15,58,15,225,4 - xor esi,edi - ror edx,5 - and esi,ecx -db 102,15,58,15,251,4 - mov DWORD [esp],ecx - xor edx,ecx - xor edi,esi - movdqa xmm5,xmm4 - ror edx,6 - mov ecx,eax - movdqa xmm6,xmm4 - add edx,edi - mov edi,DWORD [20+esp] - psrld 
xmm4,3 - mov esi,eax - ror ecx,9 - paddd xmm1,xmm7 - mov DWORD [16+esp],eax - xor ecx,eax - psrld xmm6,7 - xor eax,edi - add edx,DWORD [12+esp] - ror ecx,11 - and ebx,eax - pshufd xmm7,xmm0,250 - xor ecx,esi - add edx,DWORD [48+esp] - pslld xmm5,14 - xor ebx,edi - ror ecx,2 - pxor xmm4,xmm6 - add ebx,edx - add edx,DWORD [28+esp] - psrld xmm6,11 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm4,xmm5 - mov esi,DWORD [esp] - xor edx,ecx - pslld xmm5,11 - mov edi,DWORD [4+esp] - xor esi,edi - ror edx,5 - pxor xmm4,xmm6 - and esi,ecx - mov DWORD [28+esp],ecx - movdqa xmm6,xmm7 - xor edx,ecx - xor edi,esi - ror edx,6 - pxor xmm4,xmm5 - mov ecx,ebx - add edx,edi - psrld xmm7,10 - mov edi,DWORD [16+esp] - mov esi,ebx - ror ecx,9 - paddd xmm1,xmm4 - mov DWORD [12+esp],ebx - xor ecx,ebx - psrlq xmm6,17 - xor ebx,edi - add edx,DWORD [8+esp] - ror ecx,11 - pxor xmm7,xmm6 - and eax,ebx - xor ecx,esi - psrlq xmm6,2 - add edx,DWORD [52+esp] - xor eax,edi - ror ecx,2 - pxor xmm7,xmm6 - add eax,edx - add edx,DWORD [24+esp] - pshufd xmm7,xmm7,128 - add eax,ecx - mov ecx,edx - ror edx,14 - mov esi,DWORD [28+esp] - xor edx,ecx - mov edi,DWORD [esp] - xor esi,edi - ror edx,5 - and esi,ecx - psrldq xmm7,8 - mov DWORD [24+esp],ecx - xor edx,ecx - xor edi,esi - paddd xmm1,xmm7 - ror edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [12+esp] - mov esi,eax - ror ecx,9 - mov DWORD [8+esp],eax - pshufd xmm7,xmm1,80 - xor ecx,eax - xor eax,edi - add edx,DWORD [4+esp] - movdqa xmm6,xmm7 - ror ecx,11 - psrld xmm7,10 - and ebx,eax - psrlq xmm6,17 - xor ecx,esi - add edx,DWORD [56+esp] - xor ebx,edi - ror ecx,2 - pxor xmm7,xmm6 - add ebx,edx - add edx,DWORD [20+esp] - psrlq xmm6,2 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm7,xmm6 - mov esi,DWORD [24+esp] - xor edx,ecx - mov edi,DWORD [28+esp] - pshufd xmm7,xmm7,8 - xor esi,edi - ror edx,5 - movdqa xmm6,[16+ebp] - and esi,ecx - mov DWORD [20+esp],ecx - pslldq xmm7,8 - xor edx,ecx - xor edi,esi - ror edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [8+esp] - mov esi,ebx - ror ecx,9 - paddd xmm1,xmm7 - mov DWORD [4+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [esp] - paddd xmm6,xmm1 - ror ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [60+esp] - xor eax,edi - ror ecx,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,ecx - movdqa [48+esp],xmm6 - mov ecx,edx - movdqa xmm4,xmm3 - ror edx,14 - mov esi,DWORD [20+esp] - movdqa xmm7,xmm1 - xor edx,ecx - mov edi,DWORD [24+esp] -db 102,15,58,15,226,4 - xor esi,edi - ror edx,5 - and esi,ecx -db 102,15,58,15,248,4 - mov DWORD [16+esp],ecx - xor edx,ecx - xor edi,esi - movdqa xmm5,xmm4 - ror edx,6 - mov ecx,eax - movdqa xmm6,xmm4 - add edx,edi - mov edi,DWORD [4+esp] - psrld xmm4,3 - mov esi,eax - ror ecx,9 - paddd xmm2,xmm7 - mov DWORD [esp],eax - xor ecx,eax - psrld xmm6,7 - xor eax,edi - add edx,DWORD [28+esp] - ror ecx,11 - and ebx,eax - pshufd xmm7,xmm1,250 - xor ecx,esi - add edx,DWORD [64+esp] - pslld xmm5,14 - xor ebx,edi - ror ecx,2 - pxor xmm4,xmm6 - add ebx,edx - add edx,DWORD [12+esp] - psrld xmm6,11 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm4,xmm5 - mov esi,DWORD [16+esp] - xor edx,ecx - pslld xmm5,11 - mov edi,DWORD [20+esp] - xor esi,edi - ror edx,5 - pxor xmm4,xmm6 - and esi,ecx - mov DWORD [12+esp],ecx - movdqa xmm6,xmm7 - xor edx,ecx - xor edi,esi - ror edx,6 - pxor xmm4,xmm5 - mov ecx,ebx - add edx,edi - psrld xmm7,10 - mov edi,DWORD [esp] - mov esi,ebx - ror ecx,9 - paddd xmm2,xmm4 - mov DWORD [28+esp],ebx - xor ecx,ebx - psrlq xmm6,17 - xor ebx,edi - add edx,DWORD [24+esp] - ror 
ecx,11 - pxor xmm7,xmm6 - and eax,ebx - xor ecx,esi - psrlq xmm6,2 - add edx,DWORD [68+esp] - xor eax,edi - ror ecx,2 - pxor xmm7,xmm6 - add eax,edx - add edx,DWORD [8+esp] - pshufd xmm7,xmm7,128 - add eax,ecx - mov ecx,edx - ror edx,14 - mov esi,DWORD [12+esp] - xor edx,ecx - mov edi,DWORD [16+esp] - xor esi,edi - ror edx,5 - and esi,ecx - psrldq xmm7,8 - mov DWORD [8+esp],ecx - xor edx,ecx - xor edi,esi - paddd xmm2,xmm7 - ror edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [28+esp] - mov esi,eax - ror ecx,9 - mov DWORD [24+esp],eax - pshufd xmm7,xmm2,80 - xor ecx,eax - xor eax,edi - add edx,DWORD [20+esp] - movdqa xmm6,xmm7 - ror ecx,11 - psrld xmm7,10 - and ebx,eax - psrlq xmm6,17 - xor ecx,esi - add edx,DWORD [72+esp] - xor ebx,edi - ror ecx,2 - pxor xmm7,xmm6 - add ebx,edx - add edx,DWORD [4+esp] - psrlq xmm6,2 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm7,xmm6 - mov esi,DWORD [8+esp] - xor edx,ecx - mov edi,DWORD [12+esp] - pshufd xmm7,xmm7,8 - xor esi,edi - ror edx,5 - movdqa xmm6,[32+ebp] - and esi,ecx - mov DWORD [4+esp],ecx - pslldq xmm7,8 - xor edx,ecx - xor edi,esi - ror edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [24+esp] - mov esi,ebx - ror ecx,9 - paddd xmm2,xmm7 - mov DWORD [20+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [16+esp] - paddd xmm6,xmm2 - ror ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [76+esp] - xor eax,edi - ror ecx,2 - add eax,edx - add edx,DWORD [esp] - add eax,ecx - movdqa [64+esp],xmm6 - mov ecx,edx - movdqa xmm4,xmm0 - ror edx,14 - mov esi,DWORD [4+esp] - movdqa xmm7,xmm2 - xor edx,ecx - mov edi,DWORD [8+esp] -db 102,15,58,15,227,4 - xor esi,edi - ror edx,5 - and esi,ecx -db 102,15,58,15,249,4 - mov DWORD [esp],ecx - xor edx,ecx - xor edi,esi - movdqa xmm5,xmm4 - ror edx,6 - mov ecx,eax - movdqa xmm6,xmm4 - add edx,edi - mov edi,DWORD [20+esp] - psrld xmm4,3 - mov esi,eax - ror ecx,9 - paddd xmm3,xmm7 - mov DWORD [16+esp],eax - xor ecx,eax - psrld xmm6,7 - xor eax,edi - add edx,DWORD [12+esp] - ror ecx,11 - and ebx,eax - pshufd xmm7,xmm2,250 - xor ecx,esi - add edx,DWORD [80+esp] - pslld xmm5,14 - xor ebx,edi - ror ecx,2 - pxor xmm4,xmm6 - add ebx,edx - add edx,DWORD [28+esp] - psrld xmm6,11 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm4,xmm5 - mov esi,DWORD [esp] - xor edx,ecx - pslld xmm5,11 - mov edi,DWORD [4+esp] - xor esi,edi - ror edx,5 - pxor xmm4,xmm6 - and esi,ecx - mov DWORD [28+esp],ecx - movdqa xmm6,xmm7 - xor edx,ecx - xor edi,esi - ror edx,6 - pxor xmm4,xmm5 - mov ecx,ebx - add edx,edi - psrld xmm7,10 - mov edi,DWORD [16+esp] - mov esi,ebx - ror ecx,9 - paddd xmm3,xmm4 - mov DWORD [12+esp],ebx - xor ecx,ebx - psrlq xmm6,17 - xor ebx,edi - add edx,DWORD [8+esp] - ror ecx,11 - pxor xmm7,xmm6 - and eax,ebx - xor ecx,esi - psrlq xmm6,2 - add edx,DWORD [84+esp] - xor eax,edi - ror ecx,2 - pxor xmm7,xmm6 - add eax,edx - add edx,DWORD [24+esp] - pshufd xmm7,xmm7,128 - add eax,ecx - mov ecx,edx - ror edx,14 - mov esi,DWORD [28+esp] - xor edx,ecx - mov edi,DWORD [esp] - xor esi,edi - ror edx,5 - and esi,ecx - psrldq xmm7,8 - mov DWORD [24+esp],ecx - xor edx,ecx - xor edi,esi - paddd xmm3,xmm7 - ror edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [12+esp] - mov esi,eax - ror ecx,9 - mov DWORD [8+esp],eax - pshufd xmm7,xmm3,80 - xor ecx,eax - xor eax,edi - add edx,DWORD [4+esp] - movdqa xmm6,xmm7 - ror ecx,11 - psrld xmm7,10 - and ebx,eax - psrlq xmm6,17 - xor ecx,esi - add edx,DWORD [88+esp] - xor ebx,edi - ror ecx,2 - pxor xmm7,xmm6 - add ebx,edx - add edx,DWORD [20+esp] - psrlq xmm6,2 - add ebx,ecx - mov 
ecx,edx

[… remaining SSSE3 rounds and the _sha256_block_data_order_avx routine of this deleted generated file omitted …]

-	mov	esp,DWORD [108+esp]
-	vzeroall
-	pop	edi
-	pop	esi
-	pop	ebx
-	pop	ebp
-	ret
-%else
-; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
-ret
-%endif
-#endif  // defined(__i386__) && defined(_WIN32)
-#if defined(__linux__) && defined(__ELF__)
-.section	.note.GNU-stack,"",%progbits
-#endif
-
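For orientation while reading these deleted generated files: the SSSE3/AVX x86 code above and the ARMv4 code below are unrolled, scheduled implementations of the same SHA-256 compression function. The plain-C sketch below is not part of the diff and not BoringSSL's code; apart from the K256 constants (which match the table embedded in the assembly), all identifiers are illustrative. It shows the per-block computation the assembly implements: the sigma0/sigma1 message schedule and the Sigma1/Ch/Maj/Sigma0 round update named in the assembly comments.

/*
 * Illustrative reference only; not part of this diff and not BoringSSL API.
 */
#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n) {
  return (x >> n) | (x << (32 - n));
}

/* Same constants as the K256 table embedded in the deleted assembly. */
static const uint32_t K256[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
    0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
    0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};

/* One 64-byte block; state[8] is the running a..h chaining value. */
static void sha256_block_ref(uint32_t state[8], const uint8_t block[64]) {
  uint32_t w[64];
  for (int t = 0; t < 16; t++) {
    /* Message words are loaded big-endian (the rev / pshufb steps above). */
    w[t] = (uint32_t)block[4 * t] << 24 | (uint32_t)block[4 * t + 1] << 16 |
           (uint32_t)block[4 * t + 2] << 8 | (uint32_t)block[4 * t + 3];
  }
  for (int t = 16; t < 64; t++) {
    /* sigma0(X[i+1]) and sigma1(X[i+14]) in the assembly comments. */
    uint32_t s0 = rotr32(w[t - 15], 7) ^ rotr32(w[t - 15], 18) ^ (w[t - 15] >> 3);
    uint32_t s1 = rotr32(w[t - 2], 17) ^ rotr32(w[t - 2], 19) ^ (w[t - 2] >> 10);
    w[t] = w[t - 16] + s0 + w[t - 7] + s1;
  }
  uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
  uint32_t e = state[4], f = state[5], g = state[6], h = state[7];
  for (int t = 0; t < 64; t++) {
    /* Sigma1(e), Ch(e,f,g), Sigma0(a), Maj(a,b,c): the quantities the
       ror/shrd and eor/xor sequences in the assembly compute. */
    uint32_t S1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    uint32_t ch = (e & f) ^ (~e & g);
    uint32_t t1 = h + S1 + ch + K256[t] + w[t];
    uint32_t S0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    uint32_t t2 = S0 + maj;
    h = g; g = f; f = e; e = d + t1;
    d = c; c = b; b = a; a = t1 + t2;
  }
  state[0] += a; state[1] += b; state[2] += c; state[3] += d;
  state[4] += e; state[5] += f; state[6] += g; state[7] += h;
}

The generated files compute exactly this, but with the rounds fully unrolled and the message schedule carried in vector registers; only the per-target scheduling differs.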
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-ios.ios.arm.S
deleted file mode 100644
index b82984e3..00000000
--- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-ios.ios.arm.S
+++ /dev/null
@@ -1,2852 +0,0 @@
-#define BORINGSSL_PREFIX CCryptoBoringSSL
-#if defined(__arm__) && defined(__APPLE__)
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
-@
-@ Licensed under the OpenSSL license (the "License").  You may not use
-@ this file except in compliance with the License.  You can obtain a copy
-@ in the file LICENSE in the source distribution or at
-@ https://www.openssl.org/source/license.html
-
-
-@ ====================================================================
-@ Written by Andy Polyakov for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Permission to use under GPL terms is granted.
-@ ====================================================================
-
-@ SHA256 block procedure for ARMv4. May 2007.
-
-@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
-@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-@ byte [on single-issue Xscale PXA250 core].
-
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
-@ Cortex A8 core and ~20 cycles per processed byte.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 16%
-@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-@ September 2013.
-@
-@ Add NEON implementation. On Cortex A8 it was measured to process one
-@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-@ code (meaning that latter performs sub-optimally, nothing was done
-@ about it).
-
-@ May 2014.
-@
-@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-#ifndef __KERNEL__
-# include
-#else
-# define __ARM_ARCH __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
-@ instructions are manually-encoded. (See unsha256.)
-
-.text
-#if defined(__thumb2__)
-.syntax	unified
-.thumb
-#else
-.code	32
-#endif
-
-.align	5
-K256:
-.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.word	0	@ terminator
-.align	5
-
-.globl	_sha256_block_data_order_nohw
-.private_extern	_sha256_block_data_order_nohw
-#ifdef __thumb2__
-.thumb_func	_sha256_block_data_order_nohw
-#endif
-_sha256_block_data_order_nohw:
-	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
-	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
-	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
-	adr	r14,K256
-	sub	sp,sp,#16*4		@ alloca(X[16])
-Loop:
-# if __ARM_ARCH>=7
-	ldr	r2,[r1],#4
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r5,r6		@ magic
-	eor	r12,r12,r12

[… the expanded BODY_00_15 and BODY_16_xx round code of this deleted generated file (rounds 0 through 26 in this hunk) is omitted …]

-#if 26==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif -#if 26<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#12*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#9*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#12*4] @ 27 - @ ldr r1,[sp,#9*4] - mov r0,r2,ror#7 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#11*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#4*4] - - add r3,r3,r0 - eor r0,r5,r5,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r5,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#11*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 27==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 27<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#13*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#10*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#13*4] @ 28 - @ ldr r1,[sp,#10*4] - mov r0,r2,ror#7 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#12*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#5*4] - - add r12,r12,r0 - eor r0,r4,r4,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r4,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#12*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 28==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 28<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#14*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#11*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#14*4] @ 29 - @ ldr r1,[sp,#11*4] - mov r0,r2,ror#7 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#13*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#6*4] - - add r3,r3,r0 - eor r0,r11,r11,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r11,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#13*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 29==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 29<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#15*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#12*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#15*4] @ 30 - @ ldr r1,[sp,#12*4] - mov r0,r2,ror#7 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#14*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#7*4] - - add r12,r12,r0 - eor r0,r10,r10,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r10,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#14*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 30==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 30<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#0*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#13*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#0*4] @ 31 - @ ldr r1,[sp,#13*4] - mov r0,r2,ror#7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#15*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#8*4] - - add r3,r3,r0 - eor r0,r9,r9,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r9,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#15*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 31==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 31<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#1*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#14*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH>=7 - ite eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq r3,[sp,#16*4] @ pull ctx - bne Lrounds_16_xx - - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r0,[r3,#0] - ldr r2,[r3,#4] - ldr r12,[r3,#8] - add r4,r4,r0 - ldr r0,[r3,#12] - add r5,r5,r2 - ldr r2,[r3,#16] - add r6,r6,r12 - ldr r12,[r3,#20] - add r7,r7,r0 - ldr r0,[r3,#24] - add r8,r8,r2 - ldr r2,[r3,#28] - add r9,r9,r12 - ldr r1,[sp,#17*4] @ pull inp - ldr r12,[sp,#18*4] @ pull inp+len - add r10,r10,r0 - add r11,r11,r2 - stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} - cmp r1,r12 - sub r14,r14,#256 @ rewind Ktbl - bne Loop - - add sp,sp,#19*4 @ destroy frame -#if __ARM_ARCH>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -LK256_shortcut_neon: -@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. -#if defined(__thumb2__) -.word K256-(LK256_add_neon+4) -#else -.word K256-(LK256_add_neon+8) -#endif - -.globl _sha256_block_data_order_neon -.private_extern _sha256_block_data_order_neon -#ifdef __thumb2__ -.thumb_func _sha256_block_data_order_neon -#endif -.align 5 -.skip 16 -_sha256_block_data_order_neon: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - - sub r11,sp,#16*4+16 - - @ K256 is just at the boundary of being easily referenced by an ADR from - @ this function. In Arm mode, when building with __ARM_ARCH=6, it does - @ not fit. By moving code around, we could make it fit, but this is too - @ fragile. For simplicity, just load the offset from - @ .LK256_shortcut_neon. - @ - @ TODO(davidben): adrl would avoid a load, but clang-assembler does not - @ support it. We might be able to emulate it with a macro, but Android's - @ did not work when I tried it. 
- @ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm - ldr r14,LK256_shortcut_neon -LK256_add_neon: - add r14,pc,r14 - - bic r11,r11,#15 @ align for 128-bit stores - mov r12,sp - mov sp,r11 @ alloca - add r2,r1,r2,lsl#6 @ len to point at the end of inp - - vld1.8 {q0},[r1]! - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - vld1.32 {q8},[r14,:128]! - vld1.32 {q9},[r14,:128]! - vld1.32 {q10},[r14,:128]! - vld1.32 {q11},[r14,:128]! - vrev32.8 q0,q0 @ yes, even on - str r0,[sp,#64] - vrev32.8 q1,q1 @ big-endian - str r1,[sp,#68] - mov r1,sp - vrev32.8 q2,q2 - str r2,[sp,#72] - vrev32.8 q3,q3 - str r12,[sp,#76] @ save original sp - vadd.i32 q8,q8,q0 - vadd.i32 q9,q9,q1 - vst1.32 {q8},[r1,:128]! - vadd.i32 q10,q10,q2 - vst1.32 {q9},[r1,:128]! - vadd.i32 q11,q11,q3 - vst1.32 {q10},[r1,:128]! - vst1.32 {q11},[r1,:128]! - - ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} - sub r1,r1,#64 - ldr r2,[sp,#0] - eor r12,r12,r12 - eor r3,r5,r6 - b L_00_48 - -.align 4 -L_00_48: - vext.8 q8,q0,q1,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q2,q3,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q0,q0,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#4] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d7,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d7,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d7,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q0,q0,q9 - add r10,r10,r2 - ldr r2,[sp,#8] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d7,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d7,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d0,d0,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d0,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d0,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d0,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#12] - and r3,r3,r12 - vshr.u32 d24,d0,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! - add r8,r8,r2 - vsli.32 d24,d0,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d1,d1,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q0 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! 
- add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q1,q2,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q3,q0,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q1,q1,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#20] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d1,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d1,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d1,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q1,q1,q9 - add r6,r6,r2 - ldr r2,[sp,#24] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d1,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d1,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d2,d2,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d2,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d2,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d2,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#28] - and r3,r3,r12 - vshr.u32 d24,d2,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! - add r4,r4,r2 - vsli.32 d24,d2,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d3,d3,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q1 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vext.8 q8,q2,q3,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q0,q1,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q2,q2,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#36] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d3,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d3,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d3,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q2,q2,q9 - add r10,r10,r2 - ldr r2,[sp,#40] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d3,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d3,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d4,d4,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d4,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d4,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d4,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#44] - and r3,r3,r12 - vshr.u32 d24,d4,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! 
- add r8,r8,r2 - vsli.32 d24,d4,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d5,d5,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q2 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q3,q0,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q1,q2,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q3,q3,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#52] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d5,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d5,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d5,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q3,q3,q9 - add r6,r6,r2 - ldr r2,[sp,#56] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d5,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d5,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d6,d6,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d6,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d6,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d6,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#60] - and r3,r3,r12 - vshr.u32 d24,d6,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! - add r4,r4,r2 - vsli.32 d24,d6,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d7,d7,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q3 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[r14] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - teq r2,#0 @ check for K256 terminator - ldr r2,[sp,#0] - sub r1,r1,#64 - bne L_00_48 - - ldr r1,[sp,#68] - ldr r0,[sp,#72] - sub r14,r14,#256 @ rewind r14 - teq r1,r0 - it eq - subeq r1,r1,#64 @ avoid SEGV - vld1.8 {q0},[r1]! @ load next input block - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - it ne - strne r1,[sp,#68] - mov r1,sp - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! 
- and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q0,q0 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q0 - ldr r2,[sp,#4] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#8] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#12] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q1,q1 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q1 - ldr r2,[sp,#20] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#24] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#28] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! 
- and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q2,q2 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q2 - ldr r2,[sp,#36] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#40] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#44] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q3,q3 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q3 - ldr r2,[sp,#52] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#56] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#60] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#64] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! - ldr r0,[r2,#0] - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r12,[r2,#4] - ldr r3,[r2,#8] - ldr r1,[r2,#12] - add r4,r4,r0 @ accumulate - ldr r0,[r2,#16] - add r5,r5,r12 - ldr r12,[r2,#20] - add r6,r6,r3 - ldr r3,[r2,#24] - add r7,r7,r1 - ldr r1,[r2,#28] - add r8,r8,r0 - str r4,[r2],#4 - add r9,r9,r12 - str r5,[r2],#4 - add r10,r10,r3 - str r6,[r2],#4 - add r11,r11,r1 - str r7,[r2],#4 - stmia r2,{r8,r9,r10,r11} - - ittte ne - movne r1,sp - ldrne r2,[sp,#0] - eorne r12,r12,r12 - ldreq sp,[sp,#76] @ restore original sp - itt ne - eorne r3,r5,r6 - bne L_00_48 - - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} - -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - -# if defined(__thumb2__) -# define INST(a,b,c,d) .byte c,d|0xc,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d -# endif - -LK256_shortcut_hw: -@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. 
-#if defined(__thumb2__) -.word K256-(LK256_add_hw+4) -#else -.word K256-(LK256_add_hw+8) -#endif - -.globl _sha256_block_data_order_hw -.private_extern _sha256_block_data_order_hw -#ifdef __thumb2__ -.thumb_func _sha256_block_data_order_hw -#endif -.align 5 -_sha256_block_data_order_hw: - @ K256 is too far to reference from one ADR command in Thumb mode. In - @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte - @ boundary. For simplicity, just load the offset from .LK256_shortcut_hw. - ldr r3,LK256_shortcut_hw -LK256_add_hw: - add r3,pc,r3 - - vld1.32 {q0,q1},[r0] - add r2,r1,r2,lsl#6 @ len to point at the end of inp - b Loop_v8 - -.align 4 -Loop_v8: - vld1.8 {q8,q9},[r1]! - vld1.8 {q10,q11},[r1]! - vld1.32 {q12},[r3]! - vrev32.8 q8,q8 - vrev32.8 q9,q9 - vrev32.8 q10,q10 - vrev32.8 q11,q11 - vmov q14,q0 @ offload - vmov q15,q1 - teq r1,r2 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! 
- vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - - vld1.32 {q13},[r3] - vadd.i32 q12,q12,q10 - sub r3,r3,#256-16 @ rewind - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - - vadd.i32 q13,q13,q11 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - - vadd.i32 q0,q0,q14 - vadd.i32 q1,q1,q15 - it ne - bne Loop_v8 - - vst1.32 {q0,q1},[r0] - - bx lr @ bx lr - -#endif -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-windows.windows.x86.S deleted file mode 100644 index 8e1da2bb..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-windows.windows.x86.S +++ /dev/null @@ -1,2853 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -global _sha512_block_data_order -align 16 -_sha512_block_data_order: -L$_sha512_block_data_order_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov ebx,esp - call L$000pic_point -L$000pic_point: - pop ebp - lea ebp,[(L$001K512-L$000pic_point)+ebp] - sub esp,16 - and esp,-64 - shl eax,7 - add eax,edi - mov DWORD [esp],esi - mov DWORD [4+esp],edi - mov DWORD [8+esp],eax - mov DWORD [12+esp],ebx - lea edx,[_OPENSSL_ia32cap_P] - mov ecx,DWORD [edx] - test ecx,67108864 - jz NEAR L$002loop_x86 - mov edx,DWORD [4+edx] - movq mm0,[esi] - and ecx,16777216 - movq mm1,[8+esi] - and edx,512 - movq mm2,[16+esi] - or ecx,edx - movq mm3,[24+esi] - movq mm4,[32+esi] - movq mm5,[40+esi] - movq mm6,[48+esi] - movq mm7,[56+esi] - cmp ecx,16777728 - je NEAR L$003SSSE3 - sub esp,80 - jmp NEAR L$004loop_sse2 -align 16 -L$004loop_sse2: - movq [8+esp],mm1 - movq [16+esp],mm2 - movq [24+esp],mm3 - movq [40+esp],mm5 - movq [48+esp],mm6 - pxor mm2,mm1 - movq [56+esp],mm7 - movq mm3,mm0 - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - add edi,8 - mov edx,15 - bswap eax - bswap ebx - jmp NEAR L$00500_14_sse2 -align 16 -L$00500_14_sse2: - movd mm1,eax - mov eax,DWORD [edi] - movd mm7,ebx - mov ebx,DWORD [4+edi] - add edi,8 - bswap eax - bswap ebx - punpckldq mm7,mm1 - movq mm1,mm4 - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - movq mm0,mm3 - movq [72+esp],mm7 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - paddq mm7,[ebp] - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - sub esp,8 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm3,mm2 - movq mm2,mm0 - add ebp,8 - paddq mm3,mm6 - movq mm6,[48+esp] - dec edx - jnz NEAR L$00500_14_sse2 - movd mm1,eax - movd mm7,ebx - punpckldq mm7,mm1 - movq mm1,mm4 - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - movq mm0,mm3 - movq [72+esp],mm7 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - paddq mm7,[ebp] - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - sub esp,8 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm7,[192+esp] - paddq mm3,mm2 - movq mm2,mm0 - add ebp,8 - paddq mm3,mm6 - pxor mm0,mm0 - mov edx,32 - jmp NEAR L$00616_79_sse2 -align 16 -L$00616_79_sse2: - movq mm5,[88+esp] - movq mm1,mm7 - psrlq mm7,1 - movq mm6,mm5 - psrlq mm5,6 - psllq mm1,56 - paddq mm0,mm3 - movq mm3,mm7 - psrlq mm7,6 - 
pxor mm3,mm1 - psllq mm1,7 - pxor mm3,mm7 - psrlq mm7,1 - pxor mm3,mm1 - movq mm1,mm5 - psrlq mm5,13 - pxor mm7,mm3 - psllq mm6,3 - pxor mm1,mm5 - paddq mm7,[200+esp] - pxor mm1,mm6 - psrlq mm5,42 - paddq mm7,[128+esp] - pxor mm1,mm5 - psllq mm6,42 - movq mm5,[40+esp] - pxor mm1,mm6 - movq mm6,[48+esp] - paddq mm7,mm1 - movq mm1,mm4 - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - movq [72+esp],mm7 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - paddq mm7,[ebp] - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - sub esp,8 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm7,[192+esp] - paddq mm2,mm6 - add ebp,8 - movq mm5,[88+esp] - movq mm1,mm7 - psrlq mm7,1 - movq mm6,mm5 - psrlq mm5,6 - psllq mm1,56 - paddq mm2,mm3 - movq mm3,mm7 - psrlq mm7,6 - pxor mm3,mm1 - psllq mm1,7 - pxor mm3,mm7 - psrlq mm7,1 - pxor mm3,mm1 - movq mm1,mm5 - psrlq mm5,13 - pxor mm7,mm3 - psllq mm6,3 - pxor mm1,mm5 - paddq mm7,[200+esp] - pxor mm1,mm6 - psrlq mm5,42 - paddq mm7,[128+esp] - pxor mm1,mm5 - psllq mm6,42 - movq mm5,[40+esp] - pxor mm1,mm6 - movq mm6,[48+esp] - paddq mm7,mm1 - movq mm1,mm4 - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - movq [72+esp],mm7 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - paddq mm7,[ebp] - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - sub esp,8 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm7,[192+esp] - paddq mm0,mm6 - add ebp,8 - dec edx - jnz NEAR L$00616_79_sse2 - paddq mm0,mm3 - movq mm1,[8+esp] - movq mm3,[24+esp] - movq mm5,[40+esp] - movq mm6,[48+esp] - movq mm7,[56+esp] - pxor mm2,mm1 - paddq mm0,[esi] - paddq mm1,[8+esi] - paddq mm2,[16+esi] - paddq mm3,[24+esi] - paddq mm4,[32+esi] - paddq mm5,[40+esi] - paddq mm6,[48+esi] - paddq mm7,[56+esi] - mov eax,640 - movq [esi],mm0 - movq [8+esi],mm1 - movq [16+esi],mm2 - movq [24+esi],mm3 - movq [32+esi],mm4 - movq [40+esi],mm5 - movq [48+esi],mm6 - movq [56+esi],mm7 - lea esp,[eax*1+esp] - sub ebp,eax - cmp edi,DWORD [88+esp] - jb NEAR L$004loop_sse2 - mov esp,DWORD [92+esp] - emms - pop edi - pop esi - pop ebx - pop ebp - ret -align 32 -L$003SSSE3: - lea edx,[esp-64] - sub esp,256 - movdqa xmm1,[640+ebp] - movdqu xmm0,[edi] -db 102,15,56,0,193 - movdqa xmm3,[ebp] - movdqa xmm2,xmm1 - movdqu xmm1,[16+edi] - paddq xmm3,xmm0 -db 102,15,56,0,202 - movdqa [edx-128],xmm3 - movdqa xmm4,[16+ebp] - movdqa xmm3,xmm2 - movdqu xmm2,[32+edi] - paddq xmm4,xmm1 -db 102,15,56,0,211 - movdqa [edx-112],xmm4 - movdqa xmm5,[32+ebp] - movdqa xmm4,xmm3 - movdqu xmm3,[48+edi] - paddq xmm5,xmm2 -db 102,15,56,0,220 - movdqa [edx-96],xmm5 - movdqa xmm6,[48+ebp] - movdqa xmm5,xmm4 - movdqu xmm4,[64+edi] - paddq xmm6,xmm3 -db 102,15,56,0,229 - movdqa [edx-80],xmm6 - movdqa 
xmm7,[64+ebp] - movdqa xmm6,xmm5 - movdqu xmm5,[80+edi] - paddq xmm7,xmm4 -db 102,15,56,0,238 - movdqa [edx-64],xmm7 - movdqa [edx],xmm0 - movdqa xmm0,[80+ebp] - movdqa xmm7,xmm6 - movdqu xmm6,[96+edi] - paddq xmm0,xmm5 -db 102,15,56,0,247 - movdqa [edx-48],xmm0 - movdqa [16+edx],xmm1 - movdqa xmm1,[96+ebp] - movdqa xmm0,xmm7 - movdqu xmm7,[112+edi] - paddq xmm1,xmm6 -db 102,15,56,0,248 - movdqa [edx-32],xmm1 - movdqa [32+edx],xmm2 - movdqa xmm2,[112+ebp] - movdqa xmm0,[edx] - paddq xmm2,xmm7 - movdqa [edx-16],xmm2 - nop -align 32 -L$007loop_ssse3: - movdqa xmm2,[16+edx] - movdqa [48+edx],xmm3 - lea ebp,[128+ebp] - movq [8+esp],mm1 - mov ebx,edi - movq [16+esp],mm2 - lea edi,[128+edi] - movq [24+esp],mm3 - cmp edi,eax - movq [40+esp],mm5 - cmovb ebx,edi - movq [48+esp],mm6 - mov ecx,4 - pxor mm2,mm1 - movq [56+esp],mm7 - pxor mm3,mm3 - jmp NEAR L$00800_47_ssse3 -align 32 -L$00800_47_ssse3: - movdqa xmm3,xmm5 - movdqa xmm1,xmm2 -db 102,15,58,15,208,8 - movdqa [edx],xmm4 -db 102,15,58,15,220,8 - movdqa xmm4,xmm2 - psrlq xmm2,7 - paddq xmm0,xmm3 - movdqa xmm3,xmm4 - psrlq xmm4,1 - psllq xmm3,56 - pxor xmm2,xmm4 - psrlq xmm4,7 - pxor xmm2,xmm3 - psllq xmm3,7 - pxor xmm2,xmm4 - movdqa xmm4,xmm7 - pxor xmm2,xmm3 - movdqa xmm3,xmm7 - psrlq xmm4,6 - paddq xmm0,xmm2 - movdqa xmm2,xmm7 - psrlq xmm3,19 - psllq xmm2,3 - pxor xmm4,xmm3 - psrlq xmm3,42 - pxor xmm4,xmm2 - psllq xmm2,42 - pxor xmm4,xmm3 - movdqa xmm3,[32+edx] - pxor xmm4,xmm2 - movdqa xmm2,[ebp] - movq mm1,mm4 - paddq xmm0,xmm4 - movq mm7,[edx-128] - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - paddq xmm2,xmm0 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[32+esp] - paddq mm2,mm6 - movq mm6,[40+esp] - movq mm1,mm4 - movq mm7,[edx-120] - pxor mm5,mm6 - psrlq mm1,14 - movq [24+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [56+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[48+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[16+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[24+esp] - paddq mm0,mm6 - movq mm6,[32+esp] - movdqa [edx-128],xmm2 - movdqa xmm4,xmm6 - movdqa xmm2,xmm3 -db 102,15,58,15,217,8 - movdqa [16+edx],xmm5 -db 102,15,58,15,229,8 - movdqa xmm5,xmm3 - psrlq xmm3,7 - paddq xmm1,xmm4 - movdqa xmm4,xmm5 - psrlq xmm5,1 - psllq xmm4,56 - pxor xmm3,xmm5 - psrlq xmm5,7 - pxor xmm3,xmm4 - psllq xmm4,7 - pxor xmm3,xmm5 - movdqa xmm5,xmm0 - pxor xmm3,xmm4 - movdqa xmm4,xmm0 - psrlq xmm5,6 - paddq xmm1,xmm3 - movdqa xmm3,xmm0 - psrlq xmm4,19 - psllq xmm3,3 - pxor xmm5,xmm4 - psrlq xmm4,42 - pxor xmm5,xmm3 - psllq xmm3,42 - pxor xmm5,xmm4 - movdqa xmm4,[48+edx] - pxor xmm5,xmm3 - movdqa 
xmm3,[16+ebp] - movq mm1,mm4 - paddq xmm1,xmm5 - movq mm7,[edx-112] - pxor mm5,mm6 - psrlq mm1,14 - movq [16+esp],mm4 - paddq xmm3,xmm1 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [48+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[40+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[8+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[56+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[16+esp] - paddq mm2,mm6 - movq mm6,[24+esp] - movq mm1,mm4 - movq mm7,[edx-104] - pxor mm5,mm6 - psrlq mm1,14 - movq [8+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [40+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[32+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[48+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[8+esp] - paddq mm0,mm6 - movq mm6,[16+esp] - movdqa [edx-112],xmm3 - movdqa xmm5,xmm7 - movdqa xmm3,xmm4 -db 102,15,58,15,226,8 - movdqa [32+edx],xmm6 -db 102,15,58,15,238,8 - movdqa xmm6,xmm4 - psrlq xmm4,7 - paddq xmm2,xmm5 - movdqa xmm5,xmm6 - psrlq xmm6,1 - psllq xmm5,56 - pxor xmm4,xmm6 - psrlq xmm6,7 - pxor xmm4,xmm5 - psllq xmm5,7 - pxor xmm4,xmm6 - movdqa xmm6,xmm1 - pxor xmm4,xmm5 - movdqa xmm5,xmm1 - psrlq xmm6,6 - paddq xmm2,xmm4 - movdqa xmm4,xmm1 - psrlq xmm5,19 - psllq xmm4,3 - pxor xmm6,xmm5 - psrlq xmm5,42 - pxor xmm6,xmm4 - psllq xmm4,42 - pxor xmm6,xmm5 - movdqa xmm5,[edx] - pxor xmm6,xmm4 - movdqa xmm4,[32+ebp] - movq mm1,mm4 - paddq xmm2,xmm6 - movq mm7,[edx-96] - pxor mm5,mm6 - psrlq mm1,14 - movq [esp],mm4 - paddq xmm4,xmm2 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [32+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[24+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[56+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[40+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[esp] - paddq mm2,mm6 - movq mm6,[8+esp] - movq mm1,mm4 - movq mm7,[edx-88] - pxor mm5,mm6 - psrlq mm1,14 - movq [56+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [24+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[16+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[48+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[32+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[56+esp] 
- paddq mm0,mm6 - movq mm6,[esp] - movdqa [edx-96],xmm4 - movdqa xmm6,xmm0 - movdqa xmm4,xmm5 -db 102,15,58,15,235,8 - movdqa [48+edx],xmm7 -db 102,15,58,15,247,8 - movdqa xmm7,xmm5 - psrlq xmm5,7 - paddq xmm3,xmm6 - movdqa xmm6,xmm7 - psrlq xmm7,1 - psllq xmm6,56 - pxor xmm5,xmm7 - psrlq xmm7,7 - pxor xmm5,xmm6 - psllq xmm6,7 - pxor xmm5,xmm7 - movdqa xmm7,xmm2 - pxor xmm5,xmm6 - movdqa xmm6,xmm2 - psrlq xmm7,6 - paddq xmm3,xmm5 - movdqa xmm5,xmm2 - psrlq xmm6,19 - psllq xmm5,3 - pxor xmm7,xmm6 - psrlq xmm6,42 - pxor xmm7,xmm5 - psllq xmm5,42 - pxor xmm7,xmm6 - movdqa xmm6,[16+edx] - pxor xmm7,xmm5 - movdqa xmm5,[48+ebp] - movq mm1,mm4 - paddq xmm3,xmm7 - movq mm7,[edx-80] - pxor mm5,mm6 - psrlq mm1,14 - movq [48+esp],mm4 - paddq xmm5,xmm3 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [16+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[8+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[40+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[24+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[48+esp] - paddq mm2,mm6 - movq mm6,[56+esp] - movq mm1,mm4 - movq mm7,[edx-72] - pxor mm5,mm6 - psrlq mm1,14 - movq [40+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [8+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[32+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[16+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm0,mm6 - movq mm6,[48+esp] - movdqa [edx-80],xmm5 - movdqa xmm7,xmm1 - movdqa xmm5,xmm6 -db 102,15,58,15,244,8 - movdqa [edx],xmm0 -db 102,15,58,15,248,8 - movdqa xmm0,xmm6 - psrlq xmm6,7 - paddq xmm4,xmm7 - movdqa xmm7,xmm0 - psrlq xmm0,1 - psllq xmm7,56 - pxor xmm6,xmm0 - psrlq xmm0,7 - pxor xmm6,xmm7 - psllq xmm7,7 - pxor xmm6,xmm0 - movdqa xmm0,xmm3 - pxor xmm6,xmm7 - movdqa xmm7,xmm3 - psrlq xmm0,6 - paddq xmm4,xmm6 - movdqa xmm6,xmm3 - psrlq xmm7,19 - psllq xmm6,3 - pxor xmm0,xmm7 - psrlq xmm7,42 - pxor xmm0,xmm6 - psllq xmm6,42 - pxor xmm0,xmm7 - movdqa xmm7,[32+edx] - pxor xmm0,xmm6 - movdqa xmm6,[64+ebp] - movq mm1,mm4 - paddq xmm4,xmm0 - movq mm7,[edx-64] - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - paddq xmm6,xmm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[32+esp] - paddq mm2,mm6 - movq mm6,[40+esp] - movq mm1,mm4 - movq mm7,[edx-56] - pxor mm5,mm6 - psrlq mm1,14 - movq 
[24+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [56+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[48+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[16+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[24+esp] - paddq mm0,mm6 - movq mm6,[32+esp] - movdqa [edx-64],xmm6 - movdqa xmm0,xmm2 - movdqa xmm6,xmm7 -db 102,15,58,15,253,8 - movdqa [16+edx],xmm1 -db 102,15,58,15,193,8 - movdqa xmm1,xmm7 - psrlq xmm7,7 - paddq xmm5,xmm0 - movdqa xmm0,xmm1 - psrlq xmm1,1 - psllq xmm0,56 - pxor xmm7,xmm1 - psrlq xmm1,7 - pxor xmm7,xmm0 - psllq xmm0,7 - pxor xmm7,xmm1 - movdqa xmm1,xmm4 - pxor xmm7,xmm0 - movdqa xmm0,xmm4 - psrlq xmm1,6 - paddq xmm5,xmm7 - movdqa xmm7,xmm4 - psrlq xmm0,19 - psllq xmm7,3 - pxor xmm1,xmm0 - psrlq xmm0,42 - pxor xmm1,xmm7 - psllq xmm7,42 - pxor xmm1,xmm0 - movdqa xmm0,[48+edx] - pxor xmm1,xmm7 - movdqa xmm7,[80+ebp] - movq mm1,mm4 - paddq xmm5,xmm1 - movq mm7,[edx-48] - pxor mm5,mm6 - psrlq mm1,14 - movq [16+esp],mm4 - paddq xmm7,xmm5 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [48+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[40+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[8+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[56+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[16+esp] - paddq mm2,mm6 - movq mm6,[24+esp] - movq mm1,mm4 - movq mm7,[edx-40] - pxor mm5,mm6 - psrlq mm1,14 - movq [8+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [40+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[32+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[48+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[8+esp] - paddq mm0,mm6 - movq mm6,[16+esp] - movdqa [edx-48],xmm7 - movdqa xmm1,xmm3 - movdqa xmm7,xmm0 -db 102,15,58,15,198,8 - movdqa [32+edx],xmm2 -db 102,15,58,15,202,8 - movdqa xmm2,xmm0 - psrlq xmm0,7 - paddq xmm6,xmm1 - movdqa xmm1,xmm2 - psrlq xmm2,1 - psllq xmm1,56 - pxor xmm0,xmm2 - psrlq xmm2,7 - pxor xmm0,xmm1 - psllq xmm1,7 - pxor xmm0,xmm2 - movdqa xmm2,xmm5 - pxor xmm0,xmm1 - movdqa xmm1,xmm5 - psrlq xmm2,6 - paddq xmm6,xmm0 - movdqa xmm0,xmm5 - psrlq xmm1,19 - psllq xmm0,3 - pxor xmm2,xmm1 - psrlq xmm1,42 - pxor xmm2,xmm0 - psllq xmm0,42 - pxor xmm2,xmm1 - movdqa xmm1,[edx] - pxor xmm2,xmm0 - movdqa xmm0,[96+ebp] - movq mm1,mm4 - paddq xmm6,xmm2 - movq mm7,[edx-32] - pxor mm5,mm6 - psrlq mm1,14 - movq [esp],mm4 - paddq xmm0,xmm6 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor 
mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [32+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[24+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[56+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[40+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[esp] - paddq mm2,mm6 - movq mm6,[8+esp] - movq mm1,mm4 - movq mm7,[edx-24] - pxor mm5,mm6 - psrlq mm1,14 - movq [56+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [24+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[16+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[48+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[32+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[56+esp] - paddq mm0,mm6 - movq mm6,[esp] - movdqa [edx-32],xmm0 - movdqa xmm2,xmm4 - movdqa xmm0,xmm1 -db 102,15,58,15,207,8 - movdqa [48+edx],xmm3 -db 102,15,58,15,211,8 - movdqa xmm3,xmm1 - psrlq xmm1,7 - paddq xmm7,xmm2 - movdqa xmm2,xmm3 - psrlq xmm3,1 - psllq xmm2,56 - pxor xmm1,xmm3 - psrlq xmm3,7 - pxor xmm1,xmm2 - psllq xmm2,7 - pxor xmm1,xmm3 - movdqa xmm3,xmm6 - pxor xmm1,xmm2 - movdqa xmm2,xmm6 - psrlq xmm3,6 - paddq xmm7,xmm1 - movdqa xmm1,xmm6 - psrlq xmm2,19 - psllq xmm1,3 - pxor xmm3,xmm2 - psrlq xmm2,42 - pxor xmm3,xmm1 - psllq xmm1,42 - pxor xmm3,xmm2 - movdqa xmm2,[16+edx] - pxor xmm3,xmm1 - movdqa xmm1,[112+ebp] - movq mm1,mm4 - paddq xmm7,xmm3 - movq mm7,[edx-16] - pxor mm5,mm6 - psrlq mm1,14 - movq [48+esp],mm4 - paddq xmm1,xmm7 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [16+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[8+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[40+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[24+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[48+esp] - paddq mm2,mm6 - movq mm6,[56+esp] - movq mm1,mm4 - movq mm7,[edx-8] - pxor mm5,mm6 - psrlq mm1,14 - movq [40+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [8+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[32+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[16+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm0,mm6 - movq mm6,[48+esp] - movdqa [edx-16],xmm1 - lea ebp,[128+ebp] - dec ecx - jnz NEAR L$00800_47_ssse3 - movdqa xmm1,[ebp] - lea ebp,[ebp-640] - movdqu xmm0,[ebx] -db 102,15,56,0,193 - movdqa xmm3,[ebp] - movdqa 
xmm2,xmm1 - movdqu xmm1,[16+ebx] - paddq xmm3,xmm0 -db 102,15,56,0,202 - movq mm1,mm4 - movq mm7,[edx-128] - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[32+esp] - paddq mm2,mm6 - movq mm6,[40+esp] - movq mm1,mm4 - movq mm7,[edx-120] - pxor mm5,mm6 - psrlq mm1,14 - movq [24+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [56+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[48+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[16+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[24+esp] - paddq mm0,mm6 - movq mm6,[32+esp] - movdqa [edx-128],xmm3 - movdqa xmm4,[16+ebp] - movdqa xmm3,xmm2 - movdqu xmm2,[32+ebx] - paddq xmm4,xmm1 -db 102,15,56,0,211 - movq mm1,mm4 - movq mm7,[edx-112] - pxor mm5,mm6 - psrlq mm1,14 - movq [16+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [48+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[40+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[8+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[56+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[16+esp] - paddq mm2,mm6 - movq mm6,[24+esp] - movq mm1,mm4 - movq mm7,[edx-104] - pxor mm5,mm6 - psrlq mm1,14 - movq [8+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [40+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[32+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[48+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[8+esp] - paddq mm0,mm6 - movq mm6,[16+esp] - movdqa [edx-112],xmm4 - movdqa xmm5,[32+ebp] - movdqa xmm4,xmm3 - movdqu xmm3,[48+ebx] - paddq xmm5,xmm2 -db 102,15,56,0,220 - movq mm1,mm4 - movq mm7,[edx-96] - pxor mm5,mm6 - psrlq mm1,14 - movq [esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [32+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[24+esp] - pxor mm3,mm1 
- psllq mm4,4 - pxor mm3,mm4 - movq mm4,[56+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[40+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[esp] - paddq mm2,mm6 - movq mm6,[8+esp] - movq mm1,mm4 - movq mm7,[edx-88] - pxor mm5,mm6 - psrlq mm1,14 - movq [56+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [24+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[16+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[48+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[32+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[56+esp] - paddq mm0,mm6 - movq mm6,[esp] - movdqa [edx-96],xmm5 - movdqa xmm6,[48+ebp] - movdqa xmm5,xmm4 - movdqu xmm4,[64+ebx] - paddq xmm6,xmm3 -db 102,15,56,0,229 - movq mm1,mm4 - movq mm7,[edx-80] - pxor mm5,mm6 - psrlq mm1,14 - movq [48+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [16+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[8+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[40+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[24+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[48+esp] - paddq mm2,mm6 - movq mm6,[56+esp] - movq mm1,mm4 - movq mm7,[edx-72] - pxor mm5,mm6 - psrlq mm1,14 - movq [40+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [8+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[32+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[16+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm0,mm6 - movq mm6,[48+esp] - movdqa [edx-80],xmm6 - movdqa xmm7,[64+ebp] - movdqa xmm6,xmm5 - movdqu xmm5,[80+ebx] - paddq xmm7,xmm4 -db 102,15,56,0,238 - movq mm1,mm4 - movq mm7,[edx-64] - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[32+esp] - paddq mm2,mm6 - movq 
mm6,[40+esp] - movq mm1,mm4 - movq mm7,[edx-56] - pxor mm5,mm6 - psrlq mm1,14 - movq [24+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [56+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[48+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[16+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[24+esp] - paddq mm0,mm6 - movq mm6,[32+esp] - movdqa [edx-64],xmm7 - movdqa [edx],xmm0 - movdqa xmm0,[80+ebp] - movdqa xmm7,xmm6 - movdqu xmm6,[96+ebx] - paddq xmm0,xmm5 -db 102,15,56,0,247 - movq mm1,mm4 - movq mm7,[edx-48] - pxor mm5,mm6 - psrlq mm1,14 - movq [16+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [48+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[40+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[8+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[56+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[16+esp] - paddq mm2,mm6 - movq mm6,[24+esp] - movq mm1,mm4 - movq mm7,[edx-40] - pxor mm5,mm6 - psrlq mm1,14 - movq [8+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [40+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[32+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[48+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[8+esp] - paddq mm0,mm6 - movq mm6,[16+esp] - movdqa [edx-48],xmm0 - movdqa [16+edx],xmm1 - movdqa xmm1,[96+ebp] - movdqa xmm0,xmm7 - movdqu xmm7,[112+ebx] - paddq xmm1,xmm6 -db 102,15,56,0,248 - movq mm1,mm4 - movq mm7,[edx-32] - pxor mm5,mm6 - psrlq mm1,14 - movq [esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [32+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[24+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[56+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[40+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[esp] - paddq mm2,mm6 - movq mm6,[8+esp] - movq mm1,mm4 - movq mm7,[edx-24] - pxor mm5,mm6 - psrlq mm1,14 - movq [56+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [24+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[16+esp] - pxor mm3,mm1 - psllq mm4,4 - 
pxor mm3,mm4 - movq mm4,[48+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[32+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[56+esp] - paddq mm0,mm6 - movq mm6,[esp] - movdqa [edx-32],xmm1 - movdqa [32+edx],xmm2 - movdqa xmm2,[112+ebp] - movdqa xmm0,[edx] - paddq xmm2,xmm7 - movq mm1,mm4 - movq mm7,[edx-16] - pxor mm5,mm6 - psrlq mm1,14 - movq [48+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [16+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[8+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[40+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[24+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[48+esp] - paddq mm2,mm6 - movq mm6,[56+esp] - movq mm1,mm4 - movq mm7,[edx-8] - pxor mm5,mm6 - psrlq mm1,14 - movq [40+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [8+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[32+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[16+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm0,mm6 - movq mm6,[48+esp] - movdqa [edx-16],xmm2 - movq mm1,[8+esp] - paddq mm0,mm3 - movq mm3,[24+esp] - movq mm7,[56+esp] - pxor mm2,mm1 - paddq mm0,[esi] - paddq mm1,[8+esi] - paddq mm2,[16+esi] - paddq mm3,[24+esi] - paddq mm4,[32+esi] - paddq mm5,[40+esi] - paddq mm6,[48+esi] - paddq mm7,[56+esi] - movq [esi],mm0 - movq [8+esi],mm1 - movq [16+esi],mm2 - movq [24+esi],mm3 - movq [32+esi],mm4 - movq [40+esi],mm5 - movq [48+esi],mm6 - movq [56+esi],mm7 - cmp edi,eax - jb NEAR L$007loop_ssse3 - mov esp,DWORD [76+edx] - emms - pop edi - pop esi - pop ebx - pop ebp - ret -align 16 -L$002loop_x86: - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [16+edi] - mov ebx,DWORD [20+edi] - mov ecx,DWORD [24+edi] - mov edx,DWORD [28+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [32+edi] - mov ebx,DWORD [36+edi] - mov ecx,DWORD [40+edi] - mov edx,DWORD [44+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [48+edi] - mov ebx,DWORD [52+edi] - mov ecx,DWORD [56+edi] - mov edx,DWORD [60+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [64+edi] - mov ebx,DWORD [68+edi] - mov ecx,DWORD [72+edi] - mov edx,DWORD [76+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [80+edi] - mov ebx,DWORD [84+edi] - mov ecx,DWORD 
[88+edi] - mov edx,DWORD [92+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [96+edi] - mov ebx,DWORD [100+edi] - mov ecx,DWORD [104+edi] - mov edx,DWORD [108+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [112+edi] - mov ebx,DWORD [116+edi] - mov ecx,DWORD [120+edi] - mov edx,DWORD [124+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - add edi,128 - sub esp,72 - mov DWORD [204+esp],edi - lea edi,[8+esp] - mov ecx,16 -dd 2784229001 -align 16 -L$00900_15_x86: - mov ecx,DWORD [40+esp] - mov edx,DWORD [44+esp] - mov esi,ecx - shr ecx,9 - mov edi,edx - shr edx,9 - mov ebx,ecx - shl esi,14 - mov eax,edx - shl edi,14 - xor ebx,esi - shr ecx,5 - xor eax,edi - shr edx,5 - xor eax,ecx - shl esi,4 - xor ebx,edx - shl edi,4 - xor ebx,esi - shr ecx,4 - xor eax,edi - shr edx,4 - xor eax,ecx - shl esi,5 - xor ebx,edx - shl edi,5 - xor eax,esi - xor ebx,edi - mov ecx,DWORD [48+esp] - mov edx,DWORD [52+esp] - mov esi,DWORD [56+esp] - mov edi,DWORD [60+esp] - add eax,DWORD [64+esp] - adc ebx,DWORD [68+esp] - xor ecx,esi - xor edx,edi - and ecx,DWORD [40+esp] - and edx,DWORD [44+esp] - add eax,DWORD [192+esp] - adc ebx,DWORD [196+esp] - xor ecx,esi - xor edx,edi - mov esi,DWORD [ebp] - mov edi,DWORD [4+ebp] - add eax,ecx - adc ebx,edx - mov ecx,DWORD [32+esp] - mov edx,DWORD [36+esp] - add eax,esi - adc ebx,edi - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - add eax,ecx - adc ebx,edx - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - mov DWORD [32+esp],eax - mov DWORD [36+esp],ebx - mov esi,ecx - shr ecx,2 - mov edi,edx - shr edx,2 - mov ebx,ecx - shl esi,4 - mov eax,edx - shl edi,4 - xor ebx,esi - shr ecx,5 - xor eax,edi - shr edx,5 - xor ebx,ecx - shl esi,21 - xor eax,edx - shl edi,21 - xor eax,esi - shr ecx,21 - xor ebx,edi - shr edx,21 - xor eax,ecx - shl esi,5 - xor ebx,edx - shl edi,5 - xor eax,esi - xor ebx,edi - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - mov esi,DWORD [16+esp] - mov edi,DWORD [20+esp] - add eax,DWORD [esp] - adc ebx,DWORD [4+esp] - or ecx,esi - or edx,edi - and ecx,DWORD [24+esp] - and edx,DWORD [28+esp] - and esi,DWORD [8+esp] - and edi,DWORD [12+esp] - or ecx,esi - or edx,edi - add eax,ecx - adc ebx,edx - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - mov dl,BYTE [ebp] - sub esp,8 - lea ebp,[8+ebp] - cmp dl,148 - jne NEAR L$00900_15_x86 -align 16 -L$01016_79_x86: - mov ecx,DWORD [312+esp] - mov edx,DWORD [316+esp] - mov esi,ecx - shr ecx,1 - mov edi,edx - shr edx,1 - mov eax,ecx - shl esi,24 - mov ebx,edx - shl edi,24 - xor ebx,esi - shr ecx,6 - xor eax,edi - shr edx,6 - xor eax,ecx - shl esi,7 - xor ebx,edx - shl edi,1 - xor ebx,esi - shr ecx,1 - xor eax,edi - shr edx,1 - xor eax,ecx - shl edi,6 - xor ebx,edx - xor eax,edi - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - mov ecx,DWORD [208+esp] - mov edx,DWORD [212+esp] - mov esi,ecx - shr ecx,6 - mov edi,edx - shr edx,6 - mov eax,ecx - shl esi,3 - mov ebx,edx - shl edi,3 - xor eax,esi - shr ecx,13 - xor ebx,edi - shr edx,13 - xor eax,ecx - shl esi,10 - xor ebx,edx - shl edi,10 - xor ebx,esi - shr ecx,10 - xor eax,edi - shr edx,10 - xor ebx,ecx - shl edi,13 - xor eax,edx - xor eax,edi - mov ecx,DWORD [320+esp] - mov edx,DWORD [324+esp] - add eax,DWORD [esp] - adc ebx,DWORD [4+esp] - mov esi,DWORD [248+esp] - mov edi,DWORD [252+esp] - add eax,ecx - adc ebx,edx - add eax,esi - adc ebx,edi - mov DWORD [192+esp],eax - mov DWORD [196+esp],ebx - mov 
ecx,DWORD [40+esp] - mov edx,DWORD [44+esp] - mov esi,ecx - shr ecx,9 - mov edi,edx - shr edx,9 - mov ebx,ecx - shl esi,14 - mov eax,edx - shl edi,14 - xor ebx,esi - shr ecx,5 - xor eax,edi - shr edx,5 - xor eax,ecx - shl esi,4 - xor ebx,edx - shl edi,4 - xor ebx,esi - shr ecx,4 - xor eax,edi - shr edx,4 - xor eax,ecx - shl esi,5 - xor ebx,edx - shl edi,5 - xor eax,esi - xor ebx,edi - mov ecx,DWORD [48+esp] - mov edx,DWORD [52+esp] - mov esi,DWORD [56+esp] - mov edi,DWORD [60+esp] - add eax,DWORD [64+esp] - adc ebx,DWORD [68+esp] - xor ecx,esi - xor edx,edi - and ecx,DWORD [40+esp] - and edx,DWORD [44+esp] - add eax,DWORD [192+esp] - adc ebx,DWORD [196+esp] - xor ecx,esi - xor edx,edi - mov esi,DWORD [ebp] - mov edi,DWORD [4+ebp] - add eax,ecx - adc ebx,edx - mov ecx,DWORD [32+esp] - mov edx,DWORD [36+esp] - add eax,esi - adc ebx,edi - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - add eax,ecx - adc ebx,edx - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - mov DWORD [32+esp],eax - mov DWORD [36+esp],ebx - mov esi,ecx - shr ecx,2 - mov edi,edx - shr edx,2 - mov ebx,ecx - shl esi,4 - mov eax,edx - shl edi,4 - xor ebx,esi - shr ecx,5 - xor eax,edi - shr edx,5 - xor ebx,ecx - shl esi,21 - xor eax,edx - shl edi,21 - xor eax,esi - shr ecx,21 - xor ebx,edi - shr edx,21 - xor eax,ecx - shl esi,5 - xor ebx,edx - shl edi,5 - xor eax,esi - xor ebx,edi - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - mov esi,DWORD [16+esp] - mov edi,DWORD [20+esp] - add eax,DWORD [esp] - adc ebx,DWORD [4+esp] - or ecx,esi - or edx,edi - and ecx,DWORD [24+esp] - and edx,DWORD [28+esp] - and esi,DWORD [8+esp] - and edi,DWORD [12+esp] - or ecx,esi - or edx,edi - add eax,ecx - adc ebx,edx - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - mov dl,BYTE [ebp] - sub esp,8 - lea ebp,[8+ebp] - cmp dl,23 - jne NEAR L$01016_79_x86 - mov esi,DWORD [840+esp] - mov edi,DWORD [844+esp] - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edx,DWORD [12+esi] - add eax,DWORD [8+esp] - adc ebx,DWORD [12+esp] - mov DWORD [esi],eax - mov DWORD [4+esi],ebx - add ecx,DWORD [16+esp] - adc edx,DWORD [20+esp] - mov DWORD [8+esi],ecx - mov DWORD [12+esi],edx - mov eax,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov edx,DWORD [28+esi] - add eax,DWORD [24+esp] - adc ebx,DWORD [28+esp] - mov DWORD [16+esi],eax - mov DWORD [20+esi],ebx - add ecx,DWORD [32+esp] - adc edx,DWORD [36+esp] - mov DWORD [24+esi],ecx - mov DWORD [28+esi],edx - mov eax,DWORD [32+esi] - mov ebx,DWORD [36+esi] - mov ecx,DWORD [40+esi] - mov edx,DWORD [44+esi] - add eax,DWORD [40+esp] - adc ebx,DWORD [44+esp] - mov DWORD [32+esi],eax - mov DWORD [36+esi],ebx - add ecx,DWORD [48+esp] - adc edx,DWORD [52+esp] - mov DWORD [40+esi],ecx - mov DWORD [44+esi],edx - mov eax,DWORD [48+esi] - mov ebx,DWORD [52+esi] - mov ecx,DWORD [56+esi] - mov edx,DWORD [60+esi] - add eax,DWORD [56+esp] - adc ebx,DWORD [60+esp] - mov DWORD [48+esi],eax - mov DWORD [52+esi],ebx - add ecx,DWORD [64+esp] - adc edx,DWORD [68+esp] - mov DWORD [56+esi],ecx - mov DWORD [60+esi],edx - add esp,840 - sub ebp,640 - cmp edi,DWORD [8+esp] - jb NEAR L$002loop_x86 - mov esp,DWORD [12+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$001K512: -dd 3609767458,1116352408 -dd 602891725,1899447441 -dd 3964484399,3049323471 -dd 2173295548,3921009573 -dd 4081628472,961987163 -dd 3053834265,1508970993 -dd 2937671579,2453635748 -dd 3664609560,2870763221 -dd 2734883394,3624381080 -dd 1164996542,310598401 -dd 1323610764,607225278 -dd 3590304994,1426881987 -dd 
4068182383,1925078388 -dd 991336113,2162078206 -dd 633803317,2614888103 -dd 3479774868,3248222580 -dd 2666613458,3835390401 -dd 944711139,4022224774 -dd 2341262773,264347078 -dd 2007800933,604807628 -dd 1495990901,770255983 -dd 1856431235,1249150122 -dd 3175218132,1555081692 -dd 2198950837,1996064986 -dd 3999719339,2554220882 -dd 766784016,2821834349 -dd 2566594879,2952996808 -dd 3203337956,3210313671 -dd 1034457026,3336571891 -dd 2466948901,3584528711 -dd 3758326383,113926993 -dd 168717936,338241895 -dd 1188179964,666307205 -dd 1546045734,773529912 -dd 1522805485,1294757372 -dd 2643833823,1396182291 -dd 2343527390,1695183700 -dd 1014477480,1986661051 -dd 1206759142,2177026350 -dd 344077627,2456956037 -dd 1290863460,2730485921 -dd 3158454273,2820302411 -dd 3505952657,3259730800 -dd 106217008,3345764771 -dd 3606008344,3516065817 -dd 1432725776,3600352804 -dd 1467031594,4094571909 -dd 851169720,275423344 -dd 3100823752,430227734 -dd 1363258195,506948616 -dd 3750685593,659060556 -dd 3785050280,883997877 -dd 3318307427,958139571 -dd 3812723403,1322822218 -dd 2003034995,1537002063 -dd 3602036899,1747873779 -dd 1575990012,1955562222 -dd 1125592928,2024104815 -dd 2716904306,2227730452 -dd 442776044,2361852424 -dd 593698344,2428436474 -dd 3733110249,2756734187 -dd 2999351573,3204031479 -dd 3815920427,3329325298 -dd 3928383900,3391569614 -dd 566280711,3515267271 -dd 3454069534,3940187606 -dd 4000239992,4118630271 -dd 1914138554,116418474 -dd 2731055270,174292421 -dd 3203993006,289380356 -dd 320620315,460393269 -dd 587496836,685471733 -dd 1086792851,852142971 -dd 365543100,1017036298 -dd 2618297676,1126000580 -dd 3409855158,1288033470 -dd 4234509866,1501505948 -dd 987167468,1607167915 -dd 1246189591,1816402316 -dd 67438087,66051 -dd 202182159,134810123 -db 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 -db 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -db 62,0 -segment .bss -common _OPENSSL_ia32cap_P 16 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-ios.ios.arm.S deleted file mode 100644 index c587ac01..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-ios.ios.arm.S +++ /dev/null @@ -1,1866 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. 
-@ -@ Permission to use under GPL terms is granted. -@ ==================================================================== - -@ SHA512 block procedure for ARMv4. September 2007. - -@ This code is ~4.5 (four and a half) times faster than code generated -@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue -@ Xscale PXA250 core]. -@ -@ July 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 6% improvement on -@ Cortex A8 core and ~40 cycles per processed byte. - -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 7% -@ improvement on Coxtex A8 core and ~38 cycles per byte. - -@ March 2011. -@ -@ Add NEON implementation. On Cortex A8 it was measured to process -@ one byte in 23.3 cycles or ~60% faster than integer-only code. - -@ August 2012. -@ -@ Improve NEON performance by 12% on Snapdragon S4. In absolute -@ terms it's 22.6 cycles per byte, which is disappointing result. -@ Technical writers asserted that 3-way S4 pipeline can sustain -@ multiple NEON instructions per cycle, but dual NEON issue could -@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html -@ for further details. On side note Cortex-A15 processes one byte in -@ 16 cycles. - -@ Byte order [in]dependence. ========================================= -@ -@ Originally caller was expected to maintain specific *dword* order in -@ h[0-7], namely with most significant dword at *lower* address, which -@ was reflected in below two parameters as 0 and 4. Now caller is -@ expected to maintain native byte order for whole 64-bit values. -#ifndef __KERNEL__ -# include -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -#else -# define __ARM_MAX_ARCH__ 7 -# define VFP_ABI_PUSH -# define VFP_ABI_POP -#endif - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
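The scalar path of this deleted ARMv4 file evaluates each 64-bit SHA-512 rotation on a pair of 32-bit LO/HI register halves, following the Sigma1 identity annotated further down in the file ("LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23"). A minimal C sketch of that split, assuming the usual decomposition of a 64-bit word into 32-bit lo/hi halves (the helper name sigma1_split is illustrative, not taken from the source):

#include <stdint.h>

/* Sigma1(x) = ROTR64(x,14) ^ ROTR64(x,18) ^ ROTR64(x,41), computed on
 * 32-bit halves. For n < 32, ROTR64 yields lo' = lo>>n | hi<<(32-n) and
 * hi' = hi>>n | lo<<(32-n); rotating by 41 first swaps the halves
 * (rotate by 32) and then rotates the result by 9. */
static void sigma1_split(uint32_t lo, uint32_t hi,
                         uint32_t *out_lo, uint32_t *out_hi) {
  *out_lo = (lo >> 14 | hi << 18) ^ (lo >> 18 | hi << 14) ^ (hi >> 9 | lo << 23);
  *out_hi = (hi >> 14 | lo << 18) ^ (hi >> 18 | lo << 14) ^ (lo >> 9 | hi << 23);
}

The same half-word pattern recurs for Sigma0, sigma0, and sigma1 in the integer loop below; the NEON path instead keeps whole 64-bit lanes in d/q registers.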
- - -#ifdef __ARMEL__ -# define LO 0 -# define HI 4 -# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 -#else -# define HI 0 -# define LO 4 -# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 -#endif - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -# define adrl adr -#else -.code 32 -#endif - - -.align 5 -K512: - WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) - WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) - WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) - WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) - WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) - WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) - WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) - WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) - WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) - WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) - WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) - WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) - WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) - WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) - WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) - WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) - WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) - WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) - WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) - WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) - WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) - WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) - WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) - WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) - WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) - WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) - WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) - WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) - WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) - WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) - WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) - WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) - WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) - WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) - WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) - WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) - WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) - WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) - WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) - WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) - - -.globl _sha512_block_data_order_nohw -.private_extern _sha512_block_data_order_nohw -#ifdef __thumb2__ -.thumb_func _sha512_block_data_order_nohw -#endif -_sha512_block_data_order_nohw: - add r2,r1,r2,lsl#7 @ len to point at the end of inp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - adr r14,K512 - sub sp,sp,#9*8 - - ldr r7,[r0,#32+LO] - ldr r8,[r0,#32+HI] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] -Loop: - str r9, [sp,#48+0] - str r10, [sp,#48+4] - str r11, [sp,#56+0] - str r12, [sp,#56+4] - ldr r5,[r0,#0+LO] - ldr r6,[r0,#0+HI] - ldr r3,[r0,#8+LO] - ldr r4,[r0,#8+HI] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - str r3,[sp,#8+0] - str r4,[sp,#8+4] - str r9, [sp,#16+0] - str r10, [sp,#16+4] - str r11, [sp,#24+0] - str r12, [sp,#24+4] - ldr r3,[r0,#40+LO] - ldr r4,[r0,#40+HI] - str r3,[sp,#40+0] - str r4,[sp,#40+4] - -L00_15: -#if __ARM_ARCH<7 - ldrb r3,[r1,#7] - ldrb r9, [r1,#6] - ldrb r10, [r1,#5] - 
ldrb r11, [r1,#4] - ldrb r4,[r1,#3] - ldrb r12, [r1,#2] - orr r3,r3,r9,lsl#8 - ldrb r9, [r1,#1] - orr r3,r3,r10,lsl#16 - ldrb r10, [r1],#8 - orr r3,r3,r11,lsl#24 - orr r4,r4,r12,lsl#8 - orr r4,r4,r9,lsl#16 - orr r4,r4,r10,lsl#24 -#else - ldr r3,[r1,#4] - ldr r4,[r1],#8 -#ifdef __ARMEL__ - rev r3,r3 - rev r4,r4 -#endif -#endif - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#148 - - ldr r12,[sp,#16+0] @ c.lo -#if __ARM_ARCH>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 - tst r14,#1 - beq L00_15 - ldr r9,[sp,#184+0] - ldr r10,[sp,#184+4] - bic r14,r14,#1 -L16_79: - @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - mov r3,r9,lsr#1 - ldr r11,[sp,#80+0] - mov r4,r10,lsr#1 - ldr r12,[sp,#80+4] - eor r3,r3,r10,lsl#31 - eor r4,r4,r9,lsl#31 - eor r3,r3,r9,lsr#8 - eor r4,r4,r10,lsr#8 - eor r3,r3,r10,lsl#24 - eor r4,r4,r9,lsl#24 - eor r3,r3,r9,lsr#7 - eor r4,r4,r10,lsr#7 - eor r3,r3,r10,lsl#25 - - @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - mov r9,r11,lsr#19 - mov r10,r12,lsr#19 - eor r9,r9,r12,lsl#13 - eor r10,r10,r11,lsl#13 - eor r9,r9,r12,lsr#29 - eor r10,r10,r11,lsr#29 - eor r9,r9,r11,lsl#3 - eor r10,r10,r12,lsl#3 - eor r9,r9,r11,lsr#6 - eor r10,r10,r12,lsr#6 - ldr r11,[sp,#120+0] - eor r9,r9,r12,lsl#26 - - ldr r12,[sp,#120+4] - adds r3,r3,r9 - ldr r9,[sp,#192+0] - adc r4,r4,r10 - - ldr r10,[sp,#192+4] - adds r3,r3,r11 - adc r4,r4,r12 
- adds r3,r3,r9 - adc r4,r4,r10 - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#23 - - ldr r12,[sp,#16+0] @ c.lo -#if __ARM_ARCH>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 -#if __ARM_ARCH>=7 - ittt eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq r9,[sp,#184+0] - ldreq r10,[sp,#184+4] - beq L16_79 - bic r14,r14,#1 - - ldr r3,[sp,#8+0] - ldr r4,[sp,#8+4] - ldr r9, [r0,#0+LO] - ldr r10, [r0,#0+HI] - ldr r11, [r0,#8+LO] - ldr r12, [r0,#8+HI] - adds r9,r5,r9 - str r9, [r0,#0+LO] - adc r10,r6,r10 - str r10, [r0,#0+HI] - adds r11,r3,r11 - str r11, [r0,#8+LO] - adc r12,r4,r12 - str r12, [r0,#8+HI] - - ldr r5,[sp,#16+0] - ldr r6,[sp,#16+4] - ldr r3,[sp,#24+0] - ldr r4,[sp,#24+4] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - adds r9,r5,r9 - str r9, [r0,#16+LO] - adc r10,r6,r10 - str r10, [r0,#16+HI] - adds r11,r3,r11 - str r11, [r0,#24+LO] - adc r12,r4,r12 - str r12, [r0,#24+HI] - - ldr r3,[sp,#40+0] - ldr r4,[sp,#40+4] - ldr r9, [r0,#32+LO] - ldr r10, [r0,#32+HI] - ldr r11, [r0,#40+LO] - ldr r12, [r0,#40+HI] - adds r7,r7,r9 - str r7,[r0,#32+LO] - adc r8,r8,r10 - str r8,[r0,#32+HI] - adds r11,r3,r11 - str r11, [r0,#40+LO] - adc r12,r4,r12 - str r12, [r0,#40+HI] - - ldr r5,[sp,#48+0] - ldr r6,[sp,#48+4] - ldr r3,[sp,#56+0] - ldr r4,[sp,#56+4] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] - adds r9,r5,r9 - str r9, [r0,#48+LO] - adc r10,r6,r10 - str r10, [r0,#48+HI] - adds r11,r3,r11 - str r11, [r0,#56+LO] - adc r12,r4,r12 - str r12, 
[r0,#56+HI] - - add sp,sp,#640 - sub r14,r14,#640 - - teq r1,r2 - bne Loop - - add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.globl _sha512_block_data_order_neon -.private_extern _sha512_block_data_order_neon -#ifdef __thumb2__ -.thumb_func _sha512_block_data_order_neon -#endif -.align 4 -_sha512_block_data_order_neon: - dmb @ errata #451034 on early Cortex A8 - add r2,r1,r2,lsl#7 @ len to point at the end of inp - adr r3,K512 - VFP_ABI_PUSH - vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context -Loop_neon: - vshr.u64 d24,d20,#14 @ 0 -#if 0<16 - vld1.64 {d0},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 -#if 0>0 - vadd.i64 d16,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 0<16 && defined(__ARMEL__) - vrev64.8 d0,d0 -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d0 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 1 -#if 1<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 1>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 1<16 && defined(__ARMEL__) - vrev64.8 d1,d1 -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d1 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 2 -#if 2<16 - vld1.64 {d2},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 -#if 2>0 - vadd.i64 d22,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 2<16 && defined(__ARMEL__) - vrev64.8 d2,d2 -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d2 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 3 -#if 3<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 3>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 3<16 && defined(__ARMEL__) - vrev64.8 d3,d3 -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d3 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 4 -#if 4<16 - vld1.64 {d4},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 -#if 4>0 - vadd.i64 d20,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 4<16 && defined(__ARMEL__) - vrev64.8 d4,d4 -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d4 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 5 -#if 5<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 5>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 5<16 && defined(__ARMEL__) - vrev64.8 d5,d5 -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d5 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 6 -#if 6<16 - vld1.64 {d6},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 -#if 6>0 - vadd.i64 d18,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 6<16 && defined(__ARMEL__) - vrev64.8 d6,d6 -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d6 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 7 -#if 7<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 7>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 7<16 && defined(__ARMEL__) - vrev64.8 d7,d7 -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d7 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - vshr.u64 d24,d20,#14 @ 8 -#if 8<16 - vld1.64 {d8},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 -#if 8>0 - vadd.i64 d16,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 8<16 && defined(__ARMEL__) - vrev64.8 d8,d8 -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d8 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 9 -#if 9<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 9>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 9<16 && defined(__ARMEL__) - vrev64.8 d9,d9 -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d9 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 10 -#if 10<16 - vld1.64 {d10},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 -#if 10>0 - vadd.i64 d22,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 10<16 && defined(__ARMEL__) - vrev64.8 d10,d10 -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d10 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 11 -#if 11<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 11>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 11<16 && defined(__ARMEL__) - vrev64.8 d11,d11 -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d11 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 12 -#if 12<16 - vld1.64 {d12},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 -#if 12>0 - vadd.i64 d20,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 12<16 && defined(__ARMEL__) - vrev64.8 d12,d12 -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d12 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 13 -#if 13<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 13>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 13<16 && defined(__ARMEL__) - vrev64.8 d13,d13 -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d13 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 14 -#if 14<16 - vld1.64 {d14},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 -#if 14>0 - vadd.i64 d18,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 14<16 && defined(__ARMEL__) - vrev64.8 d14,d14 -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d14 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 15 -#if 15<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 15>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 15<16 && defined(__ARMEL__) - vrev64.8 d15,d15 -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d15 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - mov r12,#4 -L16_79_neon: - subs r12,#1 - vshr.u64 q12,q7,#19 - vshr.u64 q13,q7,#61 - vadd.i64 d16,d30 @ h+=Maj from the past - vshr.u64 q15,q7,#6 - vsli.64 q12,q7,#45 - vext.8 q14,q0,q1,#8 @ X[i+1] - vsli.64 q13,q7,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q0,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q4,q5,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q0,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q0,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 16<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d0 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 17 -#if 17<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 17>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 17<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d1 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 q12,q0,#19 - vshr.u64 q13,q0,#61 - vadd.i64 d22,d30 @ h+=Maj from the past - vshr.u64 q15,q0,#6 - vsli.64 q12,q0,#45 - vext.8 q14,q1,q2,#8 @ X[i+1] - vsli.64 q13,q0,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q1,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q5,q6,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 q1,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q1,q15 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 18<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d2 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 19 -#if 19<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 19>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 19<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d3 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 q12,q1,#19 - vshr.u64 q13,q1,#61 - vadd.i64 d20,d30 @ h+=Maj from the past - vshr.u64 q15,q1,#6 - vsli.64 q12,q1,#45 - vext.8 q14,q2,q3,#8 @ X[i+1] - vsli.64 q13,q1,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q2,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q6,q7,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from NEON_00_15 - vadd.i64 q2,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q2,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 20<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d4 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 21 -#if 21<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 21>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 21<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d5 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 q12,q2,#19 - vshr.u64 q13,q2,#61 - vadd.i64 d18,d30 @ h+=Maj from the past - vshr.u64 q15,q2,#6 - vsli.64 q12,q2,#45 - vext.8 q14,q3,q4,#8 @ X[i+1] - vsli.64 q13,q2,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q3,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q7,q0,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d22,#14 @ from NEON_00_15 - vadd.i64 q3,q14 - vshr.u64 d25,d22,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q3,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 22<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d6 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 23 -#if 23<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 23>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 23<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d7 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - vshr.u64 q12,q3,#19 - vshr.u64 q13,q3,#61 - vadd.i64 d16,d30 @ h+=Maj from the past - vshr.u64 q15,q3,#6 - vsli.64 q12,q3,#45 - vext.8 q14,q4,q5,#8 @ X[i+1] - vsli.64 q13,q3,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q4,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q0,q1,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q4,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q4,q15 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 24<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d8 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 25 -#if 25<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 25>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 25<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d9 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 q12,q4,#19 - vshr.u64 q13,q4,#61 - vadd.i64 d22,d30 @ h+=Maj from the past - vshr.u64 q15,q4,#6 - vsli.64 q12,q4,#45 - vext.8 q14,q5,q6,#8 @ X[i+1] - vsli.64 q13,q4,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q5,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q1,q2,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 q5,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q5,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 26<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d10 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 27 -#if 27<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 27>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 27<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d11 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 q12,q5,#19 - vshr.u64 q13,q5,#61 - vadd.i64 d20,d30 @ h+=Maj from the past - vshr.u64 q15,q5,#6 - vsli.64 q12,q5,#45 - vext.8 q14,q6,q7,#8 @ X[i+1] - vsli.64 q13,q5,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q6,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q2,q3,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from NEON_00_15 - vadd.i64 q6,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q6,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 28<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d12 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 29 -#if 29<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 29>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 29<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d13 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 q12,q6,#19 - vshr.u64 q13,q6,#61 - vadd.i64 d18,d30 @ h+=Maj from the past - vshr.u64 q15,q6,#6 - vsli.64 q12,q6,#45 - vext.8 q14,q7,q0,#8 @ X[i+1] - vsli.64 q13,q6,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q7,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q3,q4,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d22,#14 @ from NEON_00_15 - vadd.i64 q7,q14 - vshr.u64 d25,d22,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q7,q15 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 30<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d14 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 31 -#if 31<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 31>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 31<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d15 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - bne L16_79_neon - - vadd.i64 d16,d30 @ h+=Maj from the past - vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp - vadd.i64 q8,q12 @ vectorized accumulate - vadd.i64 q9,q13 - vadd.i64 q10,q14 - vadd.i64 q11,q15 - vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context - teq r1,r2 - sub r3,#640 @ rewind K512 - bne Loop_neon - - VFP_ABI_POP - bx lr @ .word 0xe12fff1e - -#endif -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c.inc similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c.inc index 67c2ec7b..fb8b5473 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c.inc @@ -189,6 +189,7 @@ int CRYPTO_tls13_hkdf_expand_label(uint8_t *out, size_t out_len, uint8_t *hkdf_label = NULL; size_t hkdf_label_len; + FIPS_service_indicator_lock_state(); CBB_zero(&cbb); if (!CBB_init(&cbb, 2 + 1 + sizeof(kProtocolLabel) - 1 + label_len + 1 + hash_len) || @@ -200,12 +201,18 @@ int CRYPTO_tls13_hkdf_expand_label(uint8_t *out, size_t out_len, !CBB_add_bytes(&child, hash, hash_len) || !CBB_finish(&cbb, &hkdf_label, &hkdf_label_len)) { CBB_cleanup(&cbb); + FIPS_service_indicator_unlock_state(); return 0; } const int ret = HKDF_expand(out, out_len, digest, secret, secret_len, hkdf_label, hkdf_label_len); OPENSSL_free(hkdf_label); + + FIPS_service_indicator_unlock_state(); + if (ret) { + TLSKDF_verify_service_indicator(digest); + } return ret; } diff --git 
a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-ios.ios.arm.S deleted file mode 100644 index d0f196ed..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-ios.ios.arm.S +++ /dev/null @@ -1,1264 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -.syntax unified - - - - -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -.text - - -.align 7 @ totally strategic alignment -_vpaes_consts: -Lk_mc_forward:@ mc_forward -.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -.quad 0x080B0A0904070605, 0x000302010C0F0E0D -.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -.quad 0x000302010C0F0E0D, 0x080B0A0904070605 -Lk_mc_backward:@ mc_backward -.quad 0x0605040702010003, 0x0E0D0C0F0A09080B -.quad 0x020100030E0D0C0F, 0x0A09080B06050407 -.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -.quad 0x0A09080B06050407, 0x020100030E0D0C0F -Lk_sr:@ sr -.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -.quad 0x030E09040F0A0500, 0x0B06010C07020D08 -.quad 0x0F060D040B020900, 0x070E050C030A0108 -.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -@ -@ "Hot" constants -@ -Lk_inv:@ inv, inva -.quad 0x0E05060F0D080180, 0x040703090A0B0C02 -.quad 0x01040A060F0B0780, 0x030D0E0C02050809 -Lk_ipt:@ input transform (lo, hi) -.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 -Lk_sbo:@ sbou, sbot -.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA -Lk_sb1:@ sb1u, sb1t -.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -Lk_sb2:@ sb2u, sb2t -.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD - -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 -.align 2 - -.align 6 -@@ -@@ _aes_preheat -@@ -@@ Fills q9-q15 as specified below. -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_preheat -#endif -.align 4 -_vpaes_preheat: - adr r10, Lk_inv - vmov.i8 q9, #0x0f @ Lk_s0F - vld1.64 {q10,q11}, [r10]! @ Lk_inv - add r10, r10, #64 @ Skip Lk_ipt, Lk_sbo - vld1.64 {q12,q13}, [r10]! @ Lk_sb1 - vld1.64 {q14,q15}, [r10] @ Lk_sb2 - bx lr - -@@ -@@ _aes_encrypt_core -@@ -@@ AES-encrypt q0. -@@ -@@ Inputs: -@@ q0 = input -@@ q9-q15 as in _vpaes_preheat -@@ [r2] = scheduled keys -@@ -@@ Output in q0 -@@ Clobbers q1-q5, r8-r11 -@@ Preserves q6-q8 so you get some local vectors -@@ -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt_core -#endif -.align 4 -_vpaes_encrypt_core: - mov r9, r2 - ldr r8, [r2,#240] @ pull rounds - adr r11, Lk_ipt - @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi - vld1.64 {q2, q3}, [r11] - adr r11, Lk_mc_forward+16 - vld1.64 {q5}, [r9]! 
@ vmovdqu (%r9), %xmm5 # round0 key - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 - vtbl.8 d3, {q2}, d3 - vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 - vtbl.8 d5, {q3}, d1 - veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - - @ .Lenc_entry ends with a bnz instruction which is normally paired with - @ subs in .Lenc_loop. - tst r8, r8 - b Lenc_entry - -.align 4 -Lenc_loop: - @ middle of middle round - add r10, r11, #0x40 - vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - vtbl.8 d9, {q13}, d5 - vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] - vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t - vtbl.8 d1, {q12}, d7 - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u - vtbl.8 d11, {q15}, d5 - veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A - vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - vtbl.8 d5, {q14}, d7 - vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] - vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B - vtbl.8 d7, {q0}, d3 - veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A - @ Write to q5 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D - vtbl.8 d11, {q0}, d9 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B - vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C - vtbl.8 d9, {q3}, d3 - @ Here we restore the original q0/q5 usage. - veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D - and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 - veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D - subs r8, r8, #1 @ nr-- - -Lenc_entry: - @ top of round - vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k - vtbl.8 d11, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - vtbl.8 d5, {q10}, d7 - vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - vtbl.8 d7, {q10}, d9 - veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io - veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 - bne Lenc_loop - - @ middle of last round - add r10, r11, #0x80 - - adr r11, Lk_sbo - @ Read to q1 instead of q4, so the vtbl.8 instruction below does not - @ overlap table and destination registers. - vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou - vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot Lk_sbo+16 - vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - vtbl.8 d9, {q1}, d5 - vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] - @ Write to q2 instead of q0 below, to avoid overlapping table and - @ destination registers. 
- vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t - vtbl.8 d5, {q0}, d7 - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A - @ Here we restore the original q0/q2 usage. - vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 - vtbl.8 d1, {q2}, d3 - bx lr - - -.globl _vpaes_encrypt -.private_extern _vpaes_encrypt -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt -#endif -.align 4 -_vpaes_encrypt: - @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack - @ alignment. - stmdb sp!, {r7,r8,r9,r10,r11,lr} - @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11} - - vld1.64 {q0}, [r0] - bl _vpaes_preheat - bl _vpaes_encrypt_core - vst1.64 {q0}, [r1] - - vldmia sp!, {d8,d9,d10,d11} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -@ -@ Decryption stuff -@ - -.align 4 -_vpaes_decrypt_consts: -Lk_dipt:@ decryption input transform -.quad 0x0F505B040B545F00, 0x154A411E114E451A -.quad 0x86E383E660056500, 0x12771772F491F194 -Lk_dsbo:@ decryption sbox final output -.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D -.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -Lk_dsb9:@ decryption sbox output *9*u, *9*t -.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 -.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -Lk_dsbd:@ decryption sbox output *D*u, *D*t -.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 -.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -Lk_dsbb:@ decryption sbox output *B*u, *B*t -.quad 0xD022649296B44200, 0x602646F6B0F2D404 -.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -Lk_dsbe:@ decryption sbox output *E*u, *E*t -.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 -.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 - - -@@ -@@ Decryption core -@@ -@@ Same API as encryption core, except it clobbers q12-q15 rather than using -@@ the values from _vpaes_preheat. q9-q11 must still be set from -@@ _vpaes_preheat. -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt_core -#endif -.align 4 -_vpaes_decrypt_core: - mov r9, r2 - ldr r8, [r2,#240] @ pull rounds - - @ This function performs shuffles with various constants. The x86_64 - @ version loads them on-demand into %xmm0-%xmm5. This does not work well - @ for ARMv7 because those registers are shuffle destinations. The ARMv8 - @ version preloads those constants into registers, but ARMv7 has half - @ the registers to work with. Instead, we load them on-demand into - @ q12-q15, registers normally use for preloaded constants. This is fine - @ because decryption doesn't use those constants. The values are - @ constant, so this does not interfere with potential 2x optimizations. - adr r7, Lk_dipt - - vld1.64 {q12,q13}, [r7] @ vmovdqa Lk_dipt(%rip), %xmm2 # iptlo - lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 - eor r11, r11, #0x30 @ xor $0x30, %r11 - adr r10, Lk_sr - and r11, r11, #0x30 @ and $0x30, %r11 - add r11, r11, r10 - adr r10, Lk_mc_forward+48 - - vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 - vtbl.8 d5, {q12}, d3 - vld1.64 {q5}, [r10] @ vmovdqa Lk_mc_forward+48(%rip), %xmm5 - @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi - vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 - vtbl.8 d1, {q13}, d1 - veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - - @ .Ldec_entry ends with a bnz instruction which is normally paired with - @ subs in .Ldec_loop. 
- tst r8, r8 - b Ldec_entry - -.align 4 -Ldec_loop: -@ -@ Inverse mix columns -@ - - @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of - @ the function. - adr r10, Lk_dsb9 - vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u - @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t - @ Load sbd* ahead of time. - vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu - @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt - vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u - vtbl.8 d9, {q12}, d5 - vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t - vtbl.8 d3, {q13}, d7 - veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 - - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - - @ Load sbb* ahead of time. - vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu - @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt - - vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu - vtbl.8 d9, {q14}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. - veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt - vtbl.8 d3, {q15}, d7 - @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt - - @ Load sbd* ahead of time. - vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu - @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet - - vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu - vtbl.8 d9, {q12}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. - veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt - vtbl.8 d3, {q13}, d7 - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - - vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu - vtbl.8 d9, {q14}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. 
- veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet - vtbl.8 d3, {q15}, d7 - vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - subs r8, r8, #1 @ sub $1,%rax # nr-- - -Ldec_entry: - @ top of round - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - vtbl.8 d5, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - vtbl.8 d5, {q10}, d7 - vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - vtbl.8 d7, {q10}, d9 - veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io - veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0 - bne Ldec_loop - - @ middle of last round - - adr r10, Lk_dsbo - - @ Write to q1 rather than q4 to avoid overlapping table and destination. - vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou - vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - vtbl.8 d9, {q1}, d5 - @ Write to q2 rather than q1 to avoid overlapping table and destination. - vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t - vtbl.8 d3, {q2}, d7 - vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 - veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k - @ Write to q1 rather than q0 so the table and destination registers - @ below do not overlap. - veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A - vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 - vtbl.8 d1, {q1}, d5 - bx lr - - -.globl _vpaes_decrypt -.private_extern _vpaes_decrypt -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt -#endif -.align 4 -_vpaes_decrypt: - @ _vpaes_decrypt_core uses r7-r11. - stmdb sp!, {r7,r8,r9,r10,r11,lr} - @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11} - - vld1.64 {q0}, [r0] - bl _vpaes_preheat - bl _vpaes_decrypt_core - vst1.64 {q0}, [r1] - - vldmia sp!, {d8,d9,d10,d11} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -@@ @@ -@@ AES key schedule @@ -@@ @@ -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - -@ This function diverges from both x86_64 and armv7 in which constants are -@ pinned. x86_64 has a common preheat function for all operations. aarch64 -@ separates them because it has enough registers to pin nearly all constants. -@ armv7 does not have enough registers, but needing explicit loads and stores -@ also complicates using x86_64's register allocation directly. -@ -@ We pin some constants for convenience and leave q14 and q15 free to load -@ others on demand. 
- -@ -@ Key schedule constants -@ - -.align 4 -_vpaes_key_consts: -Lk_dksd:@ decryption key schedule: invskew x*D -.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -Lk_dksb:@ decryption key schedule: invskew x*B -.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 -.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -Lk_dks9:@ decryption key schedule: invskew x*9 -.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - -Lk_rcon:@ rcon -.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -Lk_opt:@ output transform -.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 -Lk_deskew:@ deskew tables: inverts the sbox's "skew" -.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 - - -#ifdef __thumb2__ -.thumb_func _vpaes_key_preheat -#endif -.align 4 -_vpaes_key_preheat: - adr r11, Lk_rcon - vmov.i8 q12, #0x5b @ Lk_s63 - adr r10, Lk_inv @ Must be aligned to 8 mod 16. - vmov.i8 q9, #0x0f @ Lk_s0F - vld1.64 {q10,q11}, [r10] @ Lk_inv - vld1.64 {q8}, [r11] @ Lk_rcon - bx lr - - -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_core -#endif -.align 4 -_vpaes_schedule_core: - @ We only need to save lr, but ARM requires an 8-byte stack alignment, - @ so save an extra register. - stmdb sp!, {r3,lr} - - bl _vpaes_key_preheat @ load the tables - - adr r11, Lk_ipt @ Must be aligned to 8 mod 16. - vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) - - @ input transform - @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not - @ overlap table and destination. - vmov q4, q0 @ vmovdqa %xmm0, %xmm3 - bl _vpaes_schedule_transform - adr r10, Lk_sr @ Must be aligned to 8 mod 16. - vmov q7, q0 @ vmovdqa %xmm0, %xmm7 - - add r8, r8, r10 - tst r3, r3 - bne Lschedule_am_decrypting - - @ encrypting, output zeroth round key after transform - vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) - b Lschedule_go - -Lschedule_am_decrypting: - @ decrypting, output zeroth round key after shiftrows - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q4}, d3 - vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) - eor r8, r8, #0x30 @ xor $0x30, %r8 - -Lschedule_go: - cmp r1, #192 @ cmp $192, %esi - bhi Lschedule_256 - beq Lschedule_192 - @ 128: fall though - -@@ -@@ .schedule_128 -@@ -@@ 128-bit specific part of key schedule. -@@ -@@ This schedule is really simple, because all its parts -@@ are accomplished by the subroutines. -@@ -Lschedule_128: - mov r0, #10 @ mov $10, %esi - -Loop_schedule_128: - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle @ write output - b Loop_schedule_128 - -@@ -@@ .aes_schedule_192 -@@ -@@ 192-bit specific part of key schedule. -@@ -@@ The main body of this schedule is the same as the 128-bit -@@ schedule, but with more smearing. The long, high side is -@@ stored in q7 as before, and the short, low side is in -@@ the high bits of q6. -@@ -@@ This schedule is somewhat nastier, however, because each -@@ round produces 192 bits of key material, or 1.5 round keys. -@@ Therefore, on each cycle we do 2 rounds and produce 3 round -@@ keys. 
-@@ -.align 4 -Lschedule_192: - sub r0, r0, #8 - vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) - bl _vpaes_schedule_transform @ input transform - vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part - vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 - @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros - mov r0, #4 @ mov $4, %esi - -Loop_schedule_192: - bl _vpaes_schedule_round - vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 - bl _vpaes_schedule_mangle @ save key n - bl _vpaes_schedule_192_smear - bl _vpaes_schedule_mangle @ save key n+1 - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle @ save key n+2 - bl _vpaes_schedule_192_smear - b Loop_schedule_192 - -@@ -@@ .aes_schedule_256 -@@ -@@ 256-bit specific part of key schedule. -@@ -@@ The structure here is very similar to the 128-bit -@@ schedule, but with an additional "low side" in -@@ q6. The low side's rounds are the same as the -@@ high side's, except no rcon and no rotation. -@@ -.align 4 -Lschedule_256: - vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) - bl _vpaes_schedule_transform @ input transform - mov r0, #7 @ mov $7, %esi - -Loop_schedule_256: - bl _vpaes_schedule_mangle @ output low result - vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 - - @ high round - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle - - @ low round. swap xmm7 and xmm6 - vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 - vmov.i8 q4, #0 - vmov q5, q7 @ vmovdqa %xmm7, %xmm5 - vmov q7, q6 @ vmovdqa %xmm6, %xmm7 - bl _vpaes_schedule_low_round - vmov q7, q5 @ vmovdqa %xmm5, %xmm7 - - b Loop_schedule_256 - -@@ -@@ .aes_schedule_mangle_last -@@ -@@ Mangler for last round of key schedule -@@ Mangles q0 -@@ when encrypting, outputs out(q0) ^ 63 -@@ when decrypting, outputs unskew(q0) -@@ -@@ Always called right before return... jumps to cleanup and exits -@@ -.align 4 -Lschedule_mangle_last: - @ schedule last round key from xmm0 - adr r11, Lk_deskew @ lea Lk_deskew(%rip),%r11 # prepare to deskew - tst r3, r3 - bne Lschedule_mangle_last_dec - - @ encrypting - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 - adr r11, Lk_opt @ lea Lk_opt(%rip), %r11 # prepare to output transform - add r2, r2, #32 @ add $32, %rdx - vmov q2, q0 - vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute - vtbl.8 d1, {q2}, d3 - -Lschedule_mangle_last_dec: - sub r2, r2, #16 @ add $-16, %rdx - veor q0, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm0 - bl _vpaes_schedule_transform @ output transform - vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key - - @ cleanup - veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 - veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 - veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 - veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 - veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 - veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 - veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 - veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 - ldmia sp!, {r3,pc} @ return - - -@@ -@@ .aes_schedule_192_smear -@@ -@@ Smear the short, low side in the 192-bit key schedule. 
-@@ -@@ Inputs: -@@ q7: high side, b a x y -@@ q6: low side, d c 0 0 -@@ -@@ Outputs: -@@ q6: b+c+d b+c 0 0 -@@ q0: b+c+d b+c b a -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_192_smear -#endif -.align 4 -_vpaes_schedule_192_smear: - vmov.i8 q1, #0 - vdup.32 q0, d15[1] - vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 - vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a - veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 - veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 - veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a - vmov q0, q6 @ vmovdqa %xmm6, %xmm0 - vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros - bx lr - - -@@ -@@ .aes_schedule_round -@@ -@@ Runs one main round of the key schedule on q0, q7 -@@ -@@ Specifically, runs subbytes on the high dword of q0 -@@ then rotates it by one byte and xors into the low dword of -@@ q7. -@@ -@@ Adds rcon from low byte of q8, then rotates q8 for -@@ next rcon. -@@ -@@ Smears the dwords of q7 by xoring the low into the -@@ second low, result into third, result into highest. -@@ -@@ Returns results in q7 = q0. -@@ Clobbers q1-q4, r11. -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_round -#endif -.align 4 -_vpaes_schedule_round: - @ extract rcon from xmm8 - vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 - vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 - vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 - veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 - - @ rotate - vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 - vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 - - @ fall through... - - @ low round: same as high round, but no rotation and no rcon. -_vpaes_schedule_low_round: - @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. - @ We pin other values in _vpaes_key_preheat, so load them now. 
- adr r11, Lk_sb1 - vld1.64 {q14,q15}, [r11] - - @ smear xmm7 - vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 - veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 - vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 - - @ subbytes - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 - vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - vtbl.8 d5, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q7, q7, q12 @ vpxor Lk_s63(%rip), %xmm7, %xmm7 - vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak - vtbl.8 d7, {q10}, d7 - veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak - vtbl.8 d5, {q10}, d9 - veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io - veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo - vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou - vtbl.8 d9, {q15}, d7 - vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t - vtbl.8 d3, {q14}, d5 - veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output - - @ add in smeared stuff - veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 - veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 - bx lr - - -@@ -@@ .aes_schedule_transform -@@ -@@ Linear-transform q0 according to tables at [r11] -@@ -@@ Requires that q9 = 0x0F0F... as in preheat -@@ Output in q0 -@@ Clobbers q1, q2, q14, q15 -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_transform -#endif -.align 4 -_vpaes_schedule_transform: - vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo - @ vmovdqa 16(%r11), %xmm1 # hi - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d3 - vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 - vtbl.8 d1, {q15}, d1 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - bx lr - - -@@ -@@ .aes_schedule_mangle -@@ -@@ Mangles q0 from (basis-transformed) standard version -@@ to our version. -@@ -@@ On encrypt, -@@ xor with 0x63 -@@ multiply by circulant 0,1,1,1 -@@ apply shiftrows transform -@@ -@@ On decrypt, -@@ xor with 0x63 -@@ multiply by "inverse mixcolumns" circulant E,B,D,9 -@@ deskew -@@ apply shiftrows transform -@@ -@@ -@@ Writes out to [r2], and increments or decrements it -@@ Keeps track of round number mod 4 in r8 -@@ Preserves q0 -@@ Clobbers q1-q5 -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_mangle -#endif -.align 4 -_vpaes_schedule_mangle: - tst r3, r3 - vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later - adr r11, Lk_mc_forward @ Must be aligned to 8 mod 16. - vld1.64 {q5}, [r11] @ vmovdqa Lk_mc_forward(%rip),%xmm5 - bne Lschedule_mangle_dec - - @ encrypting - @ Write to q2 so we do not overlap table and destination below. 
- veor q2, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm4 - add r2, r2, #16 @ add $16, %rdx - vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 - vtbl.8 d9, {q2}, d11 - vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 - vtbl.8 d3, {q4}, d11 - vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 - vtbl.8 d7, {q1}, d11 - veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 - - b Lschedule_mangle_both -.align 4 -Lschedule_mangle_dec: - @ inverse mix columns - adr r11, Lk_dksd @ lea Lk_dksd(%rip),%r11 - vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi - vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo - - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 - @ vmovdqa 0x10(%r11), %xmm3 - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dksb ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 - @ vmovdqa 0x30(%r11), %xmm3 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dkse ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 - @ vmovdqa 0x50(%r11), %xmm3 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dkse ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 - @ vmovdqa 0x70(%r11), %xmm4 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 - vtbl.8 d9, {q15}, d3 - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 - - sub r2, r2, #16 @ add $-16, %rdx - -Lschedule_mangle_both: - @ Write to q2 so table and destination do not overlap. 
- vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d5, {q3}, d3 - add r8, r8, #64-16 @ add $-16, %r8 - and r8, r8, #~(1<<6) @ and $0x30, %r8 - vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) - bx lr - - -.globl _vpaes_set_encrypt_key -.private_extern _vpaes_set_encrypt_key -#ifdef __thumb2__ -.thumb_func _vpaes_set_encrypt_key -#endif -.align 4 -_vpaes_set_encrypt_key: - stmdb sp!, {r7,r8,r9,r10,r11, lr} - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - lsr r9, r1, #5 @ shr $5,%eax - add r9, r9, #5 @ $5,%eax - str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - - mov r3, #0 @ mov $0,%ecx - mov r8, #0x30 @ mov $0x30,%r8d - bl _vpaes_schedule_core - eor r0, r0, r0 - - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -.globl _vpaes_set_decrypt_key -.private_extern _vpaes_set_decrypt_key -#ifdef __thumb2__ -.thumb_func _vpaes_set_decrypt_key -#endif -.align 4 -_vpaes_set_decrypt_key: - stmdb sp!, {r7,r8,r9,r10,r11, lr} - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - lsr r9, r1, #5 @ shr $5,%eax - add r9, r9, #5 @ $5,%eax - str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - lsl r9, r9, #4 @ shl $4,%eax - add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx - add r2, r2, r9 - - mov r3, #1 @ mov $1,%ecx - lsr r8, r1, #1 @ shr $1,%r8d - and r8, r8, #32 @ and $32,%r8d - eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 - bl _vpaes_schedule_core - - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -@ Additional constants for converting to bsaes. - -.align 4 -_vpaes_convert_consts: -@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear -@ transform in the AES S-box. 0x63 is incorporated into the low half of the -@ table. This was computed with the following script: -@ -@ def u64s_to_u128(x, y): -@ return x | (y << 64) -@ def u128_to_u64s(w): -@ return w & ((1<<64)-1), w >> 64 -@ def get_byte(w, i): -@ return (w >> (i*8)) & 0xff -@ def apply_table(table, b): -@ lo = b & 0xf -@ hi = b >> 4 -@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) -@ def opt(b): -@ table = [ -@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), -@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), -@ ] -@ return apply_table(table, b) -@ def rot_byte(b, n): -@ return 0xff & ((b << n) | (b >> (8-n))) -@ def skew(x): -@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ -@ rot_byte(x, 4)) -@ table = [0, 0] -@ for i in range(16): -@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) -@ table[1] |= skew(opt(i<<4)) << (i*8) -@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) -@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) -Lk_opt_then_skew: -.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b -.quad 0x1f30062936192f00, 0xb49bad829db284ab - -@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation -@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 -@ becomes 0x22334411 and then 0x11443322. -Lk_decrypt_transform: -.quad 0x0704050603000102, 0x0f0c0d0e0b08090a - - -@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); -.globl _vpaes_encrypt_key_to_bsaes -.private_extern _vpaes_encrypt_key_to_bsaes -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt_key_to_bsaes -#endif -.align 4 -_vpaes_encrypt_key_to_bsaes: - stmdb sp!, {r11, lr} - - @ See _vpaes_schedule_core for the key schedule logic. 
In particular, - @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), - @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last - @ contain the transformations not in the bsaes representation. This - @ function inverts those transforms. - @ - @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key - @ representation, which does not match the other aes_nohw_* - @ implementations. The ARM aes_nohw_* stores each 32-bit word - @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the - @ cost of extra REV and VREV32 operations in little-endian ARM. - - vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform - adr r2, Lk_mc_forward @ Must be aligned to 8 mod 16. - add r3, r2, 0x90 @ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) - - vld1.64 {q12}, [r2] - vmov.i8 q10, #0x5b @ Lk_s63 from vpaes-x86_64 - adr r11, Lk_opt @ Must be aligned to 8 mod 16. - vmov.i8 q11, #0x63 @ LK_s63 without Lk_ipt applied - - @ vpaes stores one fewer round count than bsaes, but the number of keys - @ is the same. - ldr r2, [r1,#240] - add r2, r2, #1 - str r2, [r0,#240] - - @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). - @ Invert this with .Lk_opt. - vld1.64 {q0}, [r1]! - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, - @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, - @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. -Loop_enc_key_to_bsaes: - vld1.64 {q0}, [r1]! - - @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle - @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. - @ We use r3 rather than r8 to avoid a callee-saved register. - vld1.64 {q1}, [r3] - vtbl.8 d4, {q0}, d2 - vtbl.8 d5, {q0}, d3 - add r3, r3, #16 - and r3, r3, #~(1<<6) - vmov q0, q2 - - @ Handle the last key differently. - subs r2, r2, #1 - beq Loop_enc_key_to_bsaes_last - - @ Multiply by the circulant. This is its own inverse. - vtbl.8 d2, {q0}, d24 - vtbl.8 d3, {q0}, d25 - vmov q0, q1 - vtbl.8 d4, {q1}, d24 - vtbl.8 d5, {q1}, d25 - veor q0, q0, q2 - vtbl.8 d2, {q2}, d24 - vtbl.8 d3, {q2}, d25 - veor q0, q0, q1 - - @ XOR and finish. - veor q0, q0, q10 - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - b Loop_enc_key_to_bsaes - -Loop_enc_key_to_bsaes_last: - @ The final key does not have a basis transform (note - @ .Lschedule_mangle_last inverts the original transform). It only XORs - @ 0x63 and applies ShiftRows. The latter was already inverted in the - @ loop. Note that, because we act on the original representation, we use - @ q11, not q10. - veor q0, q0, q11 - vrev32.8 q0, q0 - vst1.64 {q0}, [r0] - - @ Wipe registers which contained key material. - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - - ldmia sp!, {r11, pc} @ return - - -@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); -.globl _vpaes_decrypt_key_to_bsaes -.private_extern _vpaes_decrypt_key_to_bsaes -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt_key_to_bsaes -#endif -.align 4 -_vpaes_decrypt_key_to_bsaes: - stmdb sp!, {r11, lr} - - @ See _vpaes_schedule_core for the key schedule logic. Note vpaes - @ computes the decryption key schedule in reverse. Additionally, - @ aes-x86_64.pl shares some transformations, so we must only partially - @ invert vpaes's transformations. 
In general, vpaes computes in a - @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of - @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is - @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. - @ - @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key - @ representation, which does not match the other aes_nohw_* - @ implementations. The ARM aes_nohw_* stores each 32-bit word - @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the - @ cost of extra REV and VREV32 operations in little-endian ARM. - - adr r2, Lk_decrypt_transform - adr r3, Lk_sr+0x30 - adr r11, Lk_opt_then_skew @ Input to _vpaes_schedule_transform. - vld1.64 {q12}, [r2] @ Reuse q12 from encryption. - vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform - - @ vpaes stores one fewer round count than bsaes, but the number of keys - @ is the same. - ldr r2, [r1,#240] - add r2, r2, #1 - str r2, [r0,#240] - - @ Undo the basis change and reapply the S-box affine transform. See - @ .Lschedule_mangle_last. - vld1.64 {q0}, [r1]! - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ See _vpaes_schedule_mangle for the transform on the middle keys. Note - @ it simultaneously inverts MixColumns and the S-box affine transform. - @ See .Lk_dksd through .Lk_dks9. -Loop_dec_key_to_bsaes: - vld1.64 {q0}, [r1]! - - @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going - @ forwards cancels inverting for which direction we cycle r3. We use r3 - @ rather than r8 to avoid a callee-saved register. - vld1.64 {q1}, [r3] - vtbl.8 d4, {q0}, d2 - vtbl.8 d5, {q0}, d3 - add r3, r3, #64-16 - and r3, r3, #~(1<<6) - vmov q0, q2 - - @ Handle the last key differently. - subs r2, r2, #1 - beq Loop_dec_key_to_bsaes_last - - @ Undo the basis change and reapply the S-box affine transform. - bl _vpaes_schedule_transform - - @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We - @ combine the two operations in .Lk_decrypt_transform. - @ - @ TODO(davidben): Where does the rotation come from? - vtbl.8 d2, {q0}, d24 - vtbl.8 d3, {q0}, d25 - - vst1.64 {q1}, [r0]! - b Loop_dec_key_to_bsaes - -Loop_dec_key_to_bsaes_last: - @ The final key only inverts ShiftRows (already done in the loop). See - @ .Lschedule_am_decrypting. Its basis is not transformed. - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ Wipe registers which contained key material. - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - - ldmia sp!, {r11, pc} @ return - -.globl _vpaes_ctr32_encrypt_blocks -.private_extern _vpaes_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _vpaes_ctr32_encrypt_blocks -#endif -.align 4 -_vpaes_ctr32_encrypt_blocks: - mov ip, sp - stmdb sp!, {r7,r8,r9,r10,r11, lr} - @ This function uses q4-q7 (d8-d15), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - cmp r2, #0 - @ r8 is passed on the stack. - ldr r8, [ip] - beq Lctr32_done - - @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. - mov r9, r3 - mov r3, r2 - mov r2, r9 - - @ Load the IV and counter portion. - ldr r7, [r8, #12] - vld1.8 {q7}, [r8] - - bl _vpaes_preheat - rev r7, r7 @ The counter is big-endian. - -Lctr32_loop: - vmov q0, q7 - vld1.8 {q6}, [r0]! @ Load input ahead of time - bl _vpaes_encrypt_core - veor q0, q0, q6 @ XOR input and result - vst1.8 {q0}, [r1]! - subs r3, r3, #1 - @ Update the counter. 
- add r7, r7, #1 - rev r9, r7 - vmov.32 d15[1], r9 - bne Lctr32_loop - -Lctr32_done: - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-windows.windows.x86.S deleted file mode 100644 index e0c1039a..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-windows.windows.x86.S +++ /dev/null @@ -1,686 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. - -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -%ifdef BORINGSSL_DISPATCH_TEST -extern _BORINGSSL_function_hit -%endif -align 64 -L$_vpaes_consts: -dd 218628480,235210255,168496130,67568393 -dd 252381056,17041926,33884169,51187212 -dd 252645135,252645135,252645135,252645135 -dd 1512730624,3266504856,1377990664,3401244816 -dd 830229760,1275146365,2969422977,3447763452 -dd 3411033600,2979783055,338359620,2782886510 -dd 4209124096,907596821,221174255,1006095553 -dd 191964160,3799684038,3164090317,1589111125 -dd 182528256,1777043520,2877432650,3265356744 -dd 1874708224,3503451415,3305285752,363511674 -dd 1606117888,3487855781,1093350906,2384367825 -dd 197121,67569157,134941193,202313229 -dd 67569157,134941193,202313229,197121 -dd 134941193,202313229,197121,67569157 -dd 202313229,197121,67569157,134941193 -dd 33619971,100992007,168364043,235736079 -dd 235736079,33619971,100992007,168364043 -dd 168364043,235736079,33619971,100992007 -dd 100992007,168364043,235736079,33619971 -dd 50462976,117835012,185207048,252579084 -dd 252314880,51251460,117574920,184942860 -dd 184682752,252054788,50987272,118359308 -dd 118099200,185467140,251790600,50727180 -dd 2946363062,528716217,1300004225,1881839624 -dd 1532713819,1532713819,1532713819,1532713819 -dd 3602276352,4288629033,3737020424,4153884961 -dd 1354558464,32357713,2958822624,3775749553 -dd 1201988352,132424512,1572796698,503232858 -dd 2213177600,1597421020,4103937655,675398315 -dd 2749646592,4273543773,1511898873,121693092 -dd 3040248576,1103263732,2871565598,1608280554 -dd 2236667136,2588920351,482954393,64377734 -dd 3069987328,291237287,2117370568,3650299247 -dd 533321216,3573750986,2572112006,1401264716 -dd 1339849704,2721158661,548607111,3445553514 -dd 2128193280,3054596040,2183486460,1257083700 -dd 655635200,1165381986,3923443150,2344132524 -dd 190078720,256924420,290342170,357187870 -dd 1610966272,2263057382,4103205268,309794674 -dd 2592527872,2233205587,1335446729,3402964816 -dd 3973531904,3225098121,3002836325,1918774430 -dd 3870401024,2102906079,2284471353,4117666579 -dd 617007872,1021508343,366931923,691083277 -dd 2528395776,3491914898,2968704004,1613121270 -dd 3445188352,3247741094,844474987,4093578302 -dd 651481088,1190302358,1689581232,574775300 -dd 4289380608,206939853,2555985458,2489840491 -dd 2130264064,327674451,3566485037,3349835193 -dd 2470714624,316102159,3636825756,3393945945 -db 
86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 -db 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 -db 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 -db 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 -db 118,101,114,115,105,116,121,41,0 -align 64 -align 16 -__vpaes_preheat: - add ebp,DWORD [esp] - movdqa xmm7,[ebp-48] - movdqa xmm6,[ebp-16] - ret -align 16 -__vpaes_encrypt_core: - mov ecx,16 - mov eax,DWORD [240+edx] - movdqa xmm1,xmm6 - movdqa xmm2,[ebp] - pandn xmm1,xmm0 - pand xmm0,xmm6 - movdqu xmm5,[edx] -db 102,15,56,0,208 - movdqa xmm0,[16+ebp] - pxor xmm2,xmm5 - psrld xmm1,4 - add edx,16 -db 102,15,56,0,193 - lea ebx,[192+ebp] - pxor xmm0,xmm2 - jmp NEAR L$000enc_entry -align 16 -L$001enc_loop: - movdqa xmm4,[32+ebp] - movdqa xmm0,[48+ebp] -db 102,15,56,0,226 -db 102,15,56,0,195 - pxor xmm4,xmm5 - movdqa xmm5,[64+ebp] - pxor xmm0,xmm4 - movdqa xmm1,[ecx*1+ebx-64] -db 102,15,56,0,234 - movdqa xmm2,[80+ebp] - movdqa xmm4,[ecx*1+ebx] -db 102,15,56,0,211 - movdqa xmm3,xmm0 - pxor xmm2,xmm5 -db 102,15,56,0,193 - add edx,16 - pxor xmm0,xmm2 -db 102,15,56,0,220 - add ecx,16 - pxor xmm3,xmm0 -db 102,15,56,0,193 - and ecx,48 - sub eax,1 - pxor xmm0,xmm3 -L$000enc_entry: - movdqa xmm1,xmm6 - movdqa xmm5,[ebp-32] - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm6 -db 102,15,56,0,232 - movdqa xmm3,xmm7 - pxor xmm0,xmm1 -db 102,15,56,0,217 - movdqa xmm4,xmm7 - pxor xmm3,xmm5 -db 102,15,56,0,224 - movdqa xmm2,xmm7 - pxor xmm4,xmm5 -db 102,15,56,0,211 - movdqa xmm3,xmm7 - pxor xmm2,xmm0 -db 102,15,56,0,220 - movdqu xmm5,[edx] - pxor xmm3,xmm1 - jnz NEAR L$001enc_loop - movdqa xmm4,[96+ebp] - movdqa xmm0,[112+ebp] -db 102,15,56,0,226 - pxor xmm4,xmm5 -db 102,15,56,0,195 - movdqa xmm1,[64+ecx*1+ebx] - pxor xmm0,xmm4 -db 102,15,56,0,193 - ret -align 16 -__vpaes_decrypt_core: - lea ebx,[608+ebp] - mov eax,DWORD [240+edx] - movdqa xmm1,xmm6 - movdqa xmm2,[ebx-64] - pandn xmm1,xmm0 - mov ecx,eax - psrld xmm1,4 - movdqu xmm5,[edx] - shl ecx,4 - pand xmm0,xmm6 -db 102,15,56,0,208 - movdqa xmm0,[ebx-48] - xor ecx,48 -db 102,15,56,0,193 - and ecx,48 - pxor xmm2,xmm5 - movdqa xmm5,[176+ebp] - pxor xmm0,xmm2 - add edx,16 - lea ecx,[ecx*1+ebx-352] - jmp NEAR L$002dec_entry -align 16 -L$003dec_loop: - movdqa xmm4,[ebx-32] - movdqa xmm1,[ebx-16] -db 102,15,56,0,226 -db 102,15,56,0,203 - pxor xmm0,xmm4 - movdqa xmm4,[ebx] - pxor xmm0,xmm1 - movdqa xmm1,[16+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 - pxor xmm0,xmm4 - movdqa xmm4,[32+ebx] - pxor xmm0,xmm1 - movdqa xmm1,[48+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 - pxor xmm0,xmm4 - movdqa xmm4,[64+ebx] - pxor xmm0,xmm1 - movdqa xmm1,[80+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 - pxor xmm0,xmm4 - add edx,16 -db 102,15,58,15,237,12 - pxor xmm0,xmm1 - sub eax,1 -L$002dec_entry: - movdqa xmm1,xmm6 - movdqa xmm2,[ebp-32] - pandn xmm1,xmm0 - pand xmm0,xmm6 - psrld xmm1,4 -db 102,15,56,0,208 - movdqa xmm3,xmm7 - pxor xmm0,xmm1 -db 102,15,56,0,217 - movdqa xmm4,xmm7 - pxor xmm3,xmm2 -db 102,15,56,0,224 - pxor xmm4,xmm2 - movdqa xmm2,xmm7 -db 102,15,56,0,211 - movdqa xmm3,xmm7 - pxor xmm2,xmm0 -db 102,15,56,0,220 - movdqu xmm0,[edx] - pxor xmm3,xmm1 - jnz NEAR L$003dec_loop - movdqa xmm4,[96+ebx] -db 102,15,56,0,226 - pxor xmm4,xmm0 - movdqa xmm0,[112+ebx] - movdqa xmm2,[ecx] -db 102,15,56,0,195 - pxor xmm0,xmm4 -db 102,15,56,0,194 - ret -align 16 -__vpaes_schedule_core: - add ebp,DWORD [esp] - movdqu xmm0,[esi] - movdqa xmm2,[320+ebp] - movdqa xmm3,xmm0 - lea 
ebx,[ebp] - movdqa [4+esp],xmm2 - call __vpaes_schedule_transform - movdqa xmm7,xmm0 - test edi,edi - jnz NEAR L$004schedule_am_decrypting - movdqu [edx],xmm0 - jmp NEAR L$005schedule_go -L$004schedule_am_decrypting: - movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,217 - movdqu [edx],xmm3 - xor ecx,48 -L$005schedule_go: - cmp eax,192 - ja NEAR L$006schedule_256 - je NEAR L$007schedule_192 -L$008schedule_128: - mov eax,10 -L$009loop_schedule_128: - call __vpaes_schedule_round - dec eax - jz NEAR L$010schedule_mangle_last - call __vpaes_schedule_mangle - jmp NEAR L$009loop_schedule_128 -align 16 -L$007schedule_192: - movdqu xmm0,[8+esi] - call __vpaes_schedule_transform - movdqa xmm6,xmm0 - pxor xmm4,xmm4 - movhlps xmm6,xmm4 - mov eax,4 -L$011loop_schedule_192: - call __vpaes_schedule_round -db 102,15,58,15,198,8 - call __vpaes_schedule_mangle - call __vpaes_schedule_192_smear - call __vpaes_schedule_mangle - call __vpaes_schedule_round - dec eax - jz NEAR L$010schedule_mangle_last - call __vpaes_schedule_mangle - call __vpaes_schedule_192_smear - jmp NEAR L$011loop_schedule_192 -align 16 -L$006schedule_256: - movdqu xmm0,[16+esi] - call __vpaes_schedule_transform - mov eax,7 -L$012loop_schedule_256: - call __vpaes_schedule_mangle - movdqa xmm6,xmm0 - call __vpaes_schedule_round - dec eax - jz NEAR L$010schedule_mangle_last - call __vpaes_schedule_mangle - pshufd xmm0,xmm0,255 - movdqa [20+esp],xmm7 - movdqa xmm7,xmm6 - call L$_vpaes_schedule_low_round - movdqa xmm7,[20+esp] - jmp NEAR L$012loop_schedule_256 -align 16 -L$010schedule_mangle_last: - lea ebx,[384+ebp] - test edi,edi - jnz NEAR L$013schedule_mangle_last_dec - movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,193 - lea ebx,[352+ebp] - add edx,32 -L$013schedule_mangle_last_dec: - add edx,-16 - pxor xmm0,[336+ebp] - call __vpaes_schedule_transform - movdqu [edx],xmm0 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 - ret -align 16 -__vpaes_schedule_192_smear: - pshufd xmm1,xmm6,128 - pshufd xmm0,xmm7,254 - pxor xmm6,xmm1 - pxor xmm1,xmm1 - pxor xmm6,xmm0 - movdqa xmm0,xmm6 - movhlps xmm6,xmm1 - ret -align 16 -__vpaes_schedule_round: - movdqa xmm2,[8+esp] - pxor xmm1,xmm1 -db 102,15,58,15,202,15 -db 102,15,58,15,210,15 - pxor xmm7,xmm1 - pshufd xmm0,xmm0,255 -db 102,15,58,15,192,1 - movdqa [8+esp],xmm2 -L$_vpaes_schedule_low_round: - movdqa xmm1,xmm7 - pslldq xmm7,4 - pxor xmm7,xmm1 - movdqa xmm1,xmm7 - pslldq xmm7,8 - pxor xmm7,xmm1 - pxor xmm7,[336+ebp] - movdqa xmm4,[ebp-16] - movdqa xmm5,[ebp-48] - movdqa xmm1,xmm4 - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm4 - movdqa xmm2,[ebp-32] -db 102,15,56,0,208 - pxor xmm0,xmm1 - movdqa xmm3,xmm5 -db 102,15,56,0,217 - pxor xmm3,xmm2 - movdqa xmm4,xmm5 -db 102,15,56,0,224 - pxor xmm4,xmm2 - movdqa xmm2,xmm5 -db 102,15,56,0,211 - pxor xmm2,xmm0 - movdqa xmm3,xmm5 -db 102,15,56,0,220 - pxor xmm3,xmm1 - movdqa xmm4,[32+ebp] -db 102,15,56,0,226 - movdqa xmm0,[48+ebp] -db 102,15,56,0,195 - pxor xmm0,xmm4 - pxor xmm0,xmm7 - movdqa xmm7,xmm0 - ret -align 16 -__vpaes_schedule_transform: - movdqa xmm2,[ebp-16] - movdqa xmm1,xmm2 - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm2 - movdqa xmm2,[ebx] -db 102,15,56,0,208 - movdqa xmm0,[16+ebx] -db 102,15,56,0,193 - pxor xmm0,xmm2 - ret -align 16 -__vpaes_schedule_mangle: - movdqa xmm4,xmm0 - movdqa xmm5,[128+ebp] - test edi,edi - jnz NEAR L$014schedule_mangle_dec - add edx,16 - pxor xmm4,[336+ebp] -db 102,15,56,0,229 - movdqa xmm3,xmm4 -db 102,15,56,0,229 - pxor 
xmm3,xmm4 -db 102,15,56,0,229 - pxor xmm3,xmm4 - jmp NEAR L$015schedule_mangle_both -align 16 -L$014schedule_mangle_dec: - movdqa xmm2,[ebp-16] - lea esi,[416+ebp] - movdqa xmm1,xmm2 - pandn xmm1,xmm4 - psrld xmm1,4 - pand xmm4,xmm2 - movdqa xmm2,[esi] -db 102,15,56,0,212 - movdqa xmm3,[16+esi] -db 102,15,56,0,217 - pxor xmm3,xmm2 -db 102,15,56,0,221 - movdqa xmm2,[32+esi] -db 102,15,56,0,212 - pxor xmm2,xmm3 - movdqa xmm3,[48+esi] -db 102,15,56,0,217 - pxor xmm3,xmm2 -db 102,15,56,0,221 - movdqa xmm2,[64+esi] -db 102,15,56,0,212 - pxor xmm2,xmm3 - movdqa xmm3,[80+esi] -db 102,15,56,0,217 - pxor xmm3,xmm2 -db 102,15,56,0,221 - movdqa xmm2,[96+esi] -db 102,15,56,0,212 - pxor xmm2,xmm3 - movdqa xmm3,[112+esi] -db 102,15,56,0,217 - pxor xmm3,xmm2 - add edx,-16 -L$015schedule_mangle_both: - movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,217 - add ecx,-16 - and ecx,48 - movdqu [edx],xmm3 - ret -global _vpaes_set_encrypt_key -align 16 -_vpaes_set_encrypt_key: -L$_vpaes_set_encrypt_key_begin: - push ebp - push ebx - push esi - push edi -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$016pic -L$016pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+5-L$016pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - mov esi,DWORD [20+esp] - lea ebx,[esp-56] - mov eax,DWORD [24+esp] - and ebx,-16 - mov edx,DWORD [28+esp] - xchg ebx,esp - mov DWORD [48+esp],ebx - mov ebx,eax - shr ebx,5 - add ebx,5 - mov DWORD [240+edx],ebx - mov ecx,48 - mov edi,0 - lea ebp,[(L$_vpaes_consts+0x30-L$017pic_point)] - call __vpaes_schedule_core -L$017pic_point: - mov esp,DWORD [48+esp] - xor eax,eax - pop edi - pop esi - pop ebx - pop ebp - ret -global _vpaes_set_decrypt_key -align 16 -_vpaes_set_decrypt_key: -L$_vpaes_set_decrypt_key_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - lea ebx,[esp-56] - mov eax,DWORD [24+esp] - and ebx,-16 - mov edx,DWORD [28+esp] - xchg ebx,esp - mov DWORD [48+esp],ebx - mov ebx,eax - shr ebx,5 - add ebx,5 - mov DWORD [240+edx],ebx - shl ebx,4 - lea edx,[16+ebx*1+edx] - mov edi,1 - mov ecx,eax - shr ecx,1 - and ecx,32 - xor ecx,32 - lea ebp,[(L$_vpaes_consts+0x30-L$018pic_point)] - call __vpaes_schedule_core -L$018pic_point: - mov esp,DWORD [48+esp] - xor eax,eax - pop edi - pop esi - pop ebx - pop ebp - ret -global _vpaes_encrypt -align 16 -_vpaes_encrypt: -L$_vpaes_encrypt_begin: - push ebp - push ebx - push esi - push edi -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$019pic -L$019pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+4-L$019pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - lea ebp,[(L$_vpaes_consts+0x30-L$020pic_point)] - call __vpaes_preheat -L$020pic_point: - mov esi,DWORD [20+esp] - lea ebx,[esp-56] - mov edi,DWORD [24+esp] - and ebx,-16 - mov edx,DWORD [28+esp] - xchg ebx,esp - mov DWORD [48+esp],ebx - movdqu xmm0,[esi] - call __vpaes_encrypt_core - movdqu [edi],xmm0 - mov esp,DWORD [48+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _vpaes_decrypt -align 16 -_vpaes_decrypt: -L$_vpaes_decrypt_begin: - push ebp - push ebx - push esi - push edi - lea ebp,[(L$_vpaes_consts+0x30-L$021pic_point)] - call __vpaes_preheat -L$021pic_point: - mov esi,DWORD [20+esp] - lea ebx,[esp-56] - mov edi,DWORD [24+esp] - and ebx,-16 - mov edx,DWORD [28+esp] - xchg ebx,esp - mov DWORD [48+esp],ebx - movdqu xmm0,[esi] - call __vpaes_decrypt_core - movdqu [edi],xmm0 - mov esp,DWORD [48+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _vpaes_cbc_encrypt -align 16 -_vpaes_cbc_encrypt: 
-L$_vpaes_cbc_encrypt_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - sub eax,16 - jc NEAR L$022cbc_abort - lea ebx,[esp-56] - mov ebp,DWORD [36+esp] - and ebx,-16 - mov ecx,DWORD [40+esp] - xchg ebx,esp - movdqu xmm1,[ebp] - sub edi,esi - mov DWORD [48+esp],ebx - mov DWORD [esp],edi - mov DWORD [4+esp],edx - mov DWORD [8+esp],ebp - mov edi,eax - lea ebp,[(L$_vpaes_consts+0x30-L$023pic_point)] - call __vpaes_preheat -L$023pic_point: - cmp ecx,0 - je NEAR L$024cbc_dec_loop - jmp NEAR L$025cbc_enc_loop -align 16 -L$025cbc_enc_loop: - movdqu xmm0,[esi] - pxor xmm0,xmm1 - call __vpaes_encrypt_core - mov ebx,DWORD [esp] - mov edx,DWORD [4+esp] - movdqa xmm1,xmm0 - movdqu [esi*1+ebx],xmm0 - lea esi,[16+esi] - sub edi,16 - jnc NEAR L$025cbc_enc_loop - jmp NEAR L$026cbc_done -align 16 -L$024cbc_dec_loop: - movdqu xmm0,[esi] - movdqa [16+esp],xmm1 - movdqa [32+esp],xmm0 - call __vpaes_decrypt_core - mov ebx,DWORD [esp] - mov edx,DWORD [4+esp] - pxor xmm0,[16+esp] - movdqa xmm1,[32+esp] - movdqu [esi*1+ebx],xmm0 - lea esi,[16+esi] - sub edi,16 - jnc NEAR L$024cbc_dec_loop -L$026cbc_done: - mov ebx,DWORD [8+esp] - mov esp,DWORD [48+esp] - movdqu [ebx],xmm1 -L$022cbc_abort: - pop edi - pop esi - pop ebx - pop ebp - ret -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-linux.linux.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-linux.linux.x86.S deleted file mode 100644 index 1da96a46..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-linux.linux.x86.S +++ /dev/null @@ -1,489 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -.text -.globl bn_mul_mont -.hidden bn_mul_mont -.type bn_mul_mont,@function -.align 16 -bn_mul_mont: -.L_bn_mul_mont_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - xorl %eax,%eax - movl 40(%esp),%edi - cmpl $4,%edi - jl .L000just_leave - leal 20(%esp),%esi - leal 24(%esp),%edx - addl $2,%edi - negl %edi - leal -32(%esp,%edi,4),%ebp - negl %edi - movl %ebp,%eax - subl %edx,%eax - andl $2047,%eax - subl %eax,%ebp - xorl %ebp,%edx - andl $2048,%edx - xorl $2048,%edx - subl %edx,%ebp - andl $-64,%ebp - movl %esp,%eax - subl %ebp,%eax - andl $-4096,%eax - movl %esp,%edx - leal (%ebp,%eax,1),%esp - movl (%esp),%eax - cmpl %ebp,%esp - ja .L001page_walk - jmp .L002page_walk_done -.align 16 -.L001page_walk: - leal -4096(%esp),%esp - movl (%esp),%eax - cmpl %ebp,%esp - ja .L001page_walk -.L002page_walk_done: - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%ebp - movl 16(%esi),%esi - movl (%esi),%esi - movl %eax,4(%esp) - movl %ebx,8(%esp) - movl %ecx,12(%esp) - movl %ebp,16(%esp) - movl %esi,20(%esp) - leal -3(%edi),%ebx - movl %edx,24(%esp) - call .L003PIC_me_up -.L003PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L004non_sse2 - movl $-1,%eax - movd %eax,%mm7 - movl 8(%esp),%esi - movl 12(%esp),%edi - movl 16(%esp),%ebp - xorl %edx,%edx - xorl %ecx,%ecx - movd (%edi),%mm4 - movd (%esi),%mm5 - movd (%ebp),%mm3 - pmuludq %mm4,%mm5 - movq %mm5,%mm2 - movq %mm5,%mm0 - pand %mm7,%mm0 - pmuludq 20(%esp),%mm5 - pmuludq %mm5,%mm3 - paddq %mm0,%mm3 - movd 4(%ebp),%mm1 - movd 4(%esi),%mm0 - psrlq $32,%mm2 - psrlq $32,%mm3 - incl %ecx -.align 16 -.L0051st: - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - movd 4(%ebp,%ecx,4),%mm1 - paddq %mm0,%mm3 - movd 4(%esi,%ecx,4),%mm0 - psrlq $32,%mm2 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm3 - leal 1(%ecx),%ecx - cmpl %ebx,%ecx - jl .L0051st - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - paddq %mm0,%mm3 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm2 - psrlq $32,%mm3 - paddq %mm2,%mm3 - movq %mm3,32(%esp,%ebx,4) - incl %edx -.L006outer: - xorl %ecx,%ecx - movd (%edi,%edx,4),%mm4 - movd (%esi),%mm5 - movd 32(%esp),%mm6 - movd (%ebp),%mm3 - pmuludq %mm4,%mm5 - paddq %mm6,%mm5 - movq %mm5,%mm0 - movq %mm5,%mm2 - pand %mm7,%mm0 - pmuludq 20(%esp),%mm5 - pmuludq %mm5,%mm3 - paddq %mm0,%mm3 - movd 36(%esp),%mm6 - movd 4(%ebp),%mm1 - movd 4(%esi),%mm0 - psrlq $32,%mm2 - psrlq $32,%mm3 - paddq %mm6,%mm2 - incl %ecx - decl %ebx -.L007inner: - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - movd 36(%esp,%ecx,4),%mm6 - pand %mm7,%mm0 - movd 4(%ebp,%ecx,4),%mm1 - paddq %mm0,%mm3 - movd 4(%esi,%ecx,4),%mm0 - psrlq $32,%mm2 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm3 - paddq %mm6,%mm2 - decl %ebx - leal 1(%ecx),%ecx - jnz .L007inner - movl %ecx,%ebx - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - paddq %mm0,%mm3 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm2 - psrlq $32,%mm3 - movd 36(%esp,%ebx,4),%mm6 - paddq %mm2,%mm3 - paddq %mm6,%mm3 - movq %mm3,32(%esp,%ebx,4) - leal 1(%edx),%edx - cmpl %ebx,%edx - jle .L006outer - emms - jmp .L008common_tail -.align 16 -.L004non_sse2: - movl 8(%esp),%esi - leal 1(%ebx),%ebp - movl 12(%esp),%edi - xorl %ecx,%ecx - movl %esi,%edx - andl 
$1,%ebp - subl %edi,%edx - leal 4(%edi,%ebx,4),%eax - orl %edx,%ebp - movl (%edi),%edi - jz .L009bn_sqr_mont - movl %eax,28(%esp) - movl (%esi),%eax - xorl %edx,%edx -.align 16 -.L010mull: - movl %edx,%ebp - mull %edi - addl %eax,%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - movl (%esi,%ecx,4),%eax - cmpl %ebx,%ecx - movl %ebp,28(%esp,%ecx,4) - jl .L010mull - movl %edx,%ebp - mull %edi - movl 20(%esp),%edi - addl %ebp,%eax - movl 16(%esp),%esi - adcl $0,%edx - imull 32(%esp),%edi - movl %eax,32(%esp,%ebx,4) - xorl %ecx,%ecx - movl %edx,36(%esp,%ebx,4) - movl %ecx,40(%esp,%ebx,4) - movl (%esi),%eax - mull %edi - addl 32(%esp),%eax - movl 4(%esi),%eax - adcl $0,%edx - incl %ecx - jmp .L0112ndmadd -.align 16 -.L0121stmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,28(%esp,%ecx,4) - jl .L0121stmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%eax - movl 20(%esp),%edi - adcl $0,%edx - movl 16(%esp),%esi - addl %eax,%ebp - adcl $0,%edx - imull 32(%esp),%edi - xorl %ecx,%ecx - addl 36(%esp,%ebx,4),%edx - movl %ebp,32(%esp,%ebx,4) - adcl $0,%ecx - movl (%esi),%eax - movl %edx,36(%esp,%ebx,4) - movl %ecx,40(%esp,%ebx,4) - mull %edi - addl 32(%esp),%eax - movl 4(%esi),%eax - adcl $0,%edx - movl $1,%ecx -.align 16 -.L0112ndmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,24(%esp,%ecx,4) - jl .L0112ndmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - adcl $0,%edx - movl %ebp,28(%esp,%ebx,4) - xorl %eax,%eax - movl 12(%esp),%ecx - addl 36(%esp,%ebx,4),%edx - adcl 40(%esp,%ebx,4),%eax - leal 4(%ecx),%ecx - movl %edx,32(%esp,%ebx,4) - cmpl 28(%esp),%ecx - movl %eax,36(%esp,%ebx,4) - je .L008common_tail - movl (%ecx),%edi - movl 8(%esp),%esi - movl %ecx,12(%esp) - xorl %ecx,%ecx - xorl %edx,%edx - movl (%esi),%eax - jmp .L0121stmadd -.align 16 -.L009bn_sqr_mont: - movl %ebx,(%esp) - movl %ecx,12(%esp) - movl %edi,%eax - mull %edi - movl %eax,32(%esp) - movl %edx,%ebx - shrl $1,%edx - andl $1,%ebx - incl %ecx -.align 16 -.L013sqr: - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - leal 1(%ecx),%ecx - adcl $0,%edx - leal (%ebx,%eax,2),%ebp - shrl $31,%eax - cmpl (%esp),%ecx - movl %eax,%ebx - movl %ebp,28(%esp,%ecx,4) - jl .L013sqr - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - movl 20(%esp),%edi - adcl $0,%edx - movl 16(%esp),%esi - leal (%ebx,%eax,2),%ebp - imull 32(%esp),%edi - shrl $31,%eax - movl %ebp,32(%esp,%ecx,4) - leal (%eax,%edx,2),%ebp - movl (%esi),%eax - shrl $31,%edx - movl %ebp,36(%esp,%ecx,4) - movl %edx,40(%esp,%ecx,4) - mull %edi - addl 32(%esp),%eax - movl %ecx,%ebx - adcl $0,%edx - movl 4(%esi),%eax - movl $1,%ecx -.align 16 -.L0143rdmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - movl 4(%esi,%ecx,4),%eax - adcl $0,%edx - movl %ebp,28(%esp,%ecx,4) - movl %edx,%ebp - mull %edi - addl 36(%esp,%ecx,4),%ebp - leal 2(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,24(%esp,%ecx,4) - jl .L0143rdmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - adcl $0,%edx - movl %ebp,28(%esp,%ebx,4) - movl 12(%esp),%ecx - xorl %eax,%eax - movl 8(%esp),%esi - addl 36(%esp,%ebx,4),%edx - 
adcl 40(%esp,%ebx,4),%eax - movl %edx,32(%esp,%ebx,4) - cmpl %ebx,%ecx - movl %eax,36(%esp,%ebx,4) - je .L008common_tail - movl 4(%esi,%ecx,4),%edi - leal 1(%ecx),%ecx - movl %edi,%eax - movl %ecx,12(%esp) - mull %edi - addl 32(%esp,%ecx,4),%eax - adcl $0,%edx - movl %eax,32(%esp,%ecx,4) - xorl %ebp,%ebp - cmpl %ebx,%ecx - leal 1(%ecx),%ecx - je .L015sqrlast - movl %edx,%ebx - shrl $1,%edx - andl $1,%ebx -.align 16 -.L016sqradd: - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - leal (%eax,%eax,1),%ebp - adcl $0,%edx - shrl $31,%eax - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%eax - addl %ebx,%ebp - adcl $0,%eax - cmpl (%esp),%ecx - movl %ebp,28(%esp,%ecx,4) - movl %eax,%ebx - jle .L016sqradd - movl %edx,%ebp - addl %edx,%edx - shrl $31,%ebp - addl %ebx,%edx - adcl $0,%ebp -.L015sqrlast: - movl 20(%esp),%edi - movl 16(%esp),%esi - imull 32(%esp),%edi - addl 32(%esp,%ecx,4),%edx - movl (%esi),%eax - adcl $0,%ebp - movl %edx,32(%esp,%ecx,4) - movl %ebp,36(%esp,%ecx,4) - mull %edi - addl 32(%esp),%eax - leal -1(%ecx),%ebx - adcl $0,%edx - movl $1,%ecx - movl 4(%esi),%eax - jmp .L0143rdmadd -.align 16 -.L008common_tail: - movl 16(%esp),%ebp - movl 4(%esp),%edi - leal 32(%esp),%esi - movl (%esi),%eax - movl %ebx,%ecx - xorl %edx,%edx -.align 16 -.L017sub: - sbbl (%ebp,%edx,4),%eax - movl %eax,(%edi,%edx,4) - decl %ecx - movl 4(%esi,%edx,4),%eax - leal 1(%edx),%edx - jge .L017sub - sbbl $0,%eax - movl $-1,%edx - xorl %eax,%edx - jmp .L018copy -.align 16 -.L018copy: - movl 32(%esp,%ebx,4),%esi - movl (%edi,%ebx,4),%ebp - movl %ecx,32(%esp,%ebx,4) - andl %eax,%esi - andl %edx,%ebp - orl %esi,%ebp - movl %ebp,(%edi,%ebx,4) - decl %ebx - jge .L018copy - movl 24(%esp),%esp - movl $1,%eax -.L000just_leave: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size bn_mul_mont,.-.L_bn_mul_mont_begin -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 -.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 -.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 -.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 -.byte 111,114,103,62,0 -#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-windows.windows.x86.S deleted file mode 100644 index c6eba63c..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-windows.windows.x86.S +++ /dev/null @@ -1,497 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -global _bn_mul_mont -align 16 -_bn_mul_mont: -L$_bn_mul_mont_begin: - push ebp - push ebx - push esi - push edi - xor eax,eax - mov edi,DWORD [40+esp] - cmp edi,4 - jl NEAR L$000just_leave - lea esi,[20+esp] - lea edx,[24+esp] - add edi,2 - neg edi - lea ebp,[edi*4+esp-32] - neg edi - mov eax,ebp - sub eax,edx - and eax,2047 - sub ebp,eax - xor edx,ebp - and edx,2048 - xor edx,2048 - sub ebp,edx - and ebp,-64 - mov eax,esp - sub eax,ebp - and eax,-4096 - mov edx,esp - lea esp,[eax*1+ebp] - mov eax,DWORD [esp] - cmp esp,ebp - ja NEAR L$001page_walk - jmp NEAR L$002page_walk_done -align 16 -L$001page_walk: - lea esp,[esp-4096] - mov eax,DWORD [esp] - cmp esp,ebp - ja NEAR L$001page_walk -L$002page_walk_done: - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov ebp,DWORD [12+esi] - mov esi,DWORD [16+esi] - mov esi,DWORD [esi] - mov DWORD [4+esp],eax - mov DWORD [8+esp],ebx - mov DWORD [12+esp],ecx - mov DWORD [16+esp],ebp - mov DWORD [20+esp],esi - lea ebx,[edi-3] - mov DWORD [24+esp],edx - lea eax,[_OPENSSL_ia32cap_P] - bt DWORD [eax],26 - jnc NEAR L$003non_sse2 - mov eax,-1 - movd mm7,eax - mov esi,DWORD [8+esp] - mov edi,DWORD [12+esp] - mov ebp,DWORD [16+esp] - xor edx,edx - xor ecx,ecx - movd mm4,DWORD [edi] - movd mm5,DWORD [esi] - movd mm3,DWORD [ebp] - pmuludq mm5,mm4 - movq mm2,mm5 - movq mm0,mm5 - pand mm0,mm7 - pmuludq mm5,[20+esp] - pmuludq mm3,mm5 - paddq mm3,mm0 - movd mm1,DWORD [4+ebp] - movd mm0,DWORD [4+esi] - psrlq mm2,32 - psrlq mm3,32 - inc ecx -align 16 -L$0041st: - pmuludq mm0,mm4 - pmuludq mm1,mm5 - paddq mm2,mm0 - paddq mm3,mm1 - movq mm0,mm2 - pand mm0,mm7 - movd mm1,DWORD [4+ecx*4+ebp] - paddq mm3,mm0 - movd mm0,DWORD [4+ecx*4+esi] - psrlq mm2,32 - movd DWORD [28+ecx*4+esp],mm3 - psrlq mm3,32 - lea ecx,[1+ecx] - cmp ecx,ebx - jl NEAR L$0041st - pmuludq mm0,mm4 - pmuludq mm1,mm5 - paddq mm2,mm0 - paddq mm3,mm1 - movq mm0,mm2 - pand mm0,mm7 - paddq mm3,mm0 - movd DWORD [28+ecx*4+esp],mm3 - psrlq mm2,32 - psrlq mm3,32 - paddq mm3,mm2 - movq [32+ebx*4+esp],mm3 - inc edx -L$005outer: - xor ecx,ecx - movd mm4,DWORD [edx*4+edi] - movd mm5,DWORD [esi] - movd mm6,DWORD [32+esp] - movd mm3,DWORD [ebp] - pmuludq mm5,mm4 - paddq mm5,mm6 - movq mm0,mm5 - movq mm2,mm5 - pand mm0,mm7 - pmuludq mm5,[20+esp] - pmuludq mm3,mm5 - paddq mm3,mm0 - movd mm6,DWORD [36+esp] - movd mm1,DWORD [4+ebp] - movd mm0,DWORD [4+esi] - psrlq mm2,32 - psrlq mm3,32 - paddq mm2,mm6 - inc ecx - dec ebx -L$006inner: - pmuludq mm0,mm4 - pmuludq mm1,mm5 - paddq mm2,mm0 - paddq mm3,mm1 - movq mm0,mm2 - movd mm6,DWORD [36+ecx*4+esp] - pand mm0,mm7 - movd mm1,DWORD [4+ecx*4+ebp] - paddq mm3,mm0 - movd mm0,DWORD [4+ecx*4+esi] - psrlq mm2,32 - movd DWORD [28+ecx*4+esp],mm3 - psrlq mm3,32 - paddq mm2,mm6 - dec ebx - lea ecx,[1+ecx] - jnz NEAR L$006inner - mov ebx,ecx - pmuludq mm0,mm4 - pmuludq mm1,mm5 - paddq mm2,mm0 - paddq mm3,mm1 - movq mm0,mm2 - pand mm0,mm7 - paddq mm3,mm0 - movd DWORD [28+ecx*4+esp],mm3 - psrlq mm2,32 - psrlq mm3,32 - movd mm6,DWORD [36+ebx*4+esp] - paddq mm3,mm2 - paddq mm3,mm6 - movq [32+ebx*4+esp],mm3 - lea edx,[1+edx] - cmp edx,ebx - jle NEAR L$005outer - emms - jmp NEAR L$007common_tail -align 16 -L$003non_sse2: - mov esi,DWORD [8+esp] - lea 
ebp,[1+ebx] - mov edi,DWORD [12+esp] - xor ecx,ecx - mov edx,esi - and ebp,1 - sub edx,edi - lea eax,[4+ebx*4+edi] - or ebp,edx - mov edi,DWORD [edi] - jz NEAR L$008bn_sqr_mont - mov DWORD [28+esp],eax - mov eax,DWORD [esi] - xor edx,edx -align 16 -L$009mull: - mov ebp,edx - mul edi - add ebp,eax - lea ecx,[1+ecx] - adc edx,0 - mov eax,DWORD [ecx*4+esi] - cmp ecx,ebx - mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$009mull - mov ebp,edx - mul edi - mov edi,DWORD [20+esp] - add eax,ebp - mov esi,DWORD [16+esp] - adc edx,0 - imul edi,DWORD [32+esp] - mov DWORD [32+ebx*4+esp],eax - xor ecx,ecx - mov DWORD [36+ebx*4+esp],edx - mov DWORD [40+ebx*4+esp],ecx - mov eax,DWORD [esi] - mul edi - add eax,DWORD [32+esp] - mov eax,DWORD [4+esi] - adc edx,0 - inc ecx - jmp NEAR L$0102ndmadd -align 16 -L$0111stmadd: - mov ebp,edx - mul edi - add ebp,DWORD [32+ecx*4+esp] - lea ecx,[1+ecx] - adc edx,0 - add ebp,eax - mov eax,DWORD [ecx*4+esi] - adc edx,0 - cmp ecx,ebx - mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$0111stmadd - mov ebp,edx - mul edi - add eax,DWORD [32+ebx*4+esp] - mov edi,DWORD [20+esp] - adc edx,0 - mov esi,DWORD [16+esp] - add ebp,eax - adc edx,0 - imul edi,DWORD [32+esp] - xor ecx,ecx - add edx,DWORD [36+ebx*4+esp] - mov DWORD [32+ebx*4+esp],ebp - adc ecx,0 - mov eax,DWORD [esi] - mov DWORD [36+ebx*4+esp],edx - mov DWORD [40+ebx*4+esp],ecx - mul edi - add eax,DWORD [32+esp] - mov eax,DWORD [4+esi] - adc edx,0 - mov ecx,1 -align 16 -L$0102ndmadd: - mov ebp,edx - mul edi - add ebp,DWORD [32+ecx*4+esp] - lea ecx,[1+ecx] - adc edx,0 - add ebp,eax - mov eax,DWORD [ecx*4+esi] - adc edx,0 - cmp ecx,ebx - mov DWORD [24+ecx*4+esp],ebp - jl NEAR L$0102ndmadd - mov ebp,edx - mul edi - add ebp,DWORD [32+ebx*4+esp] - adc edx,0 - add ebp,eax - adc edx,0 - mov DWORD [28+ebx*4+esp],ebp - xor eax,eax - mov ecx,DWORD [12+esp] - add edx,DWORD [36+ebx*4+esp] - adc eax,DWORD [40+ebx*4+esp] - lea ecx,[4+ecx] - mov DWORD [32+ebx*4+esp],edx - cmp ecx,DWORD [28+esp] - mov DWORD [36+ebx*4+esp],eax - je NEAR L$007common_tail - mov edi,DWORD [ecx] - mov esi,DWORD [8+esp] - mov DWORD [12+esp],ecx - xor ecx,ecx - xor edx,edx - mov eax,DWORD [esi] - jmp NEAR L$0111stmadd -align 16 -L$008bn_sqr_mont: - mov DWORD [esp],ebx - mov DWORD [12+esp],ecx - mov eax,edi - mul edi - mov DWORD [32+esp],eax - mov ebx,edx - shr edx,1 - and ebx,1 - inc ecx -align 16 -L$012sqr: - mov eax,DWORD [ecx*4+esi] - mov ebp,edx - mul edi - add eax,ebp - lea ecx,[1+ecx] - adc edx,0 - lea ebp,[eax*2+ebx] - shr eax,31 - cmp ecx,DWORD [esp] - mov ebx,eax - mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$012sqr - mov eax,DWORD [ecx*4+esi] - mov ebp,edx - mul edi - add eax,ebp - mov edi,DWORD [20+esp] - adc edx,0 - mov esi,DWORD [16+esp] - lea ebp,[eax*2+ebx] - imul edi,DWORD [32+esp] - shr eax,31 - mov DWORD [32+ecx*4+esp],ebp - lea ebp,[edx*2+eax] - mov eax,DWORD [esi] - shr edx,31 - mov DWORD [36+ecx*4+esp],ebp - mov DWORD [40+ecx*4+esp],edx - mul edi - add eax,DWORD [32+esp] - mov ebx,ecx - adc edx,0 - mov eax,DWORD [4+esi] - mov ecx,1 -align 16 -L$0133rdmadd: - mov ebp,edx - mul edi - add ebp,DWORD [32+ecx*4+esp] - adc edx,0 - add ebp,eax - mov eax,DWORD [4+ecx*4+esi] - adc edx,0 - mov DWORD [28+ecx*4+esp],ebp - mov ebp,edx - mul edi - add ebp,DWORD [36+ecx*4+esp] - lea ecx,[2+ecx] - adc edx,0 - add ebp,eax - mov eax,DWORD [ecx*4+esi] - adc edx,0 - cmp ecx,ebx - mov DWORD [24+ecx*4+esp],ebp - jl NEAR L$0133rdmadd - mov ebp,edx - mul edi - add ebp,DWORD [32+ebx*4+esp] - adc edx,0 - add ebp,eax - adc edx,0 - mov DWORD [28+ebx*4+esp],ebp - mov ecx,DWORD [12+esp] - 
xor eax,eax - mov esi,DWORD [8+esp] - add edx,DWORD [36+ebx*4+esp] - adc eax,DWORD [40+ebx*4+esp] - mov DWORD [32+ebx*4+esp],edx - cmp ecx,ebx - mov DWORD [36+ebx*4+esp],eax - je NEAR L$007common_tail - mov edi,DWORD [4+ecx*4+esi] - lea ecx,[1+ecx] - mov eax,edi - mov DWORD [12+esp],ecx - mul edi - add eax,DWORD [32+ecx*4+esp] - adc edx,0 - mov DWORD [32+ecx*4+esp],eax - xor ebp,ebp - cmp ecx,ebx - lea ecx,[1+ecx] - je NEAR L$014sqrlast - mov ebx,edx - shr edx,1 - and ebx,1 -align 16 -L$015sqradd: - mov eax,DWORD [ecx*4+esi] - mov ebp,edx - mul edi - add eax,ebp - lea ebp,[eax*1+eax] - adc edx,0 - shr eax,31 - add ebp,DWORD [32+ecx*4+esp] - lea ecx,[1+ecx] - adc eax,0 - add ebp,ebx - adc eax,0 - cmp ecx,DWORD [esp] - mov DWORD [28+ecx*4+esp],ebp - mov ebx,eax - jle NEAR L$015sqradd - mov ebp,edx - add edx,edx - shr ebp,31 - add edx,ebx - adc ebp,0 -L$014sqrlast: - mov edi,DWORD [20+esp] - mov esi,DWORD [16+esp] - imul edi,DWORD [32+esp] - add edx,DWORD [32+ecx*4+esp] - mov eax,DWORD [esi] - adc ebp,0 - mov DWORD [32+ecx*4+esp],edx - mov DWORD [36+ecx*4+esp],ebp - mul edi - add eax,DWORD [32+esp] - lea ebx,[ecx-1] - adc edx,0 - mov ecx,1 - mov eax,DWORD [4+esi] - jmp NEAR L$0133rdmadd -align 16 -L$007common_tail: - mov ebp,DWORD [16+esp] - mov edi,DWORD [4+esp] - lea esi,[32+esp] - mov eax,DWORD [esi] - mov ecx,ebx - xor edx,edx -align 16 -L$016sub: - sbb eax,DWORD [edx*4+ebp] - mov DWORD [edx*4+edi],eax - dec ecx - mov eax,DWORD [4+edx*4+esi] - lea edx,[1+edx] - jge NEAR L$016sub - sbb eax,0 - mov edx,-1 - xor edx,eax - jmp NEAR L$017copy -align 16 -L$017copy: - mov esi,DWORD [32+ebx*4+esp] - mov ebp,DWORD [ebx*4+edi] - mov DWORD [32+ebx*4+esp],ecx - and esi,eax - and ebp,edx - or ebp,esi - mov DWORD [ebx*4+edi],ebp - dec ebx - jge NEAR L$017copy - mov esp,DWORD [24+esp] - mov eax,1 -L$000just_leave: - pop edi - pop esi - pop ebx - pop ebp - ret -db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 -db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 -db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 -db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 -db 111,114,103,62,0 -segment .bss -common _OPENSSL_ia32cap_P 16 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/hpke/hpke.c b/Sources/CCryptoBoringSSL/crypto/hpke/hpke.c index 67ff51bb..0350ecdb 100644 --- a/Sources/CCryptoBoringSSL/crypto/hpke/hpke.c +++ b/Sources/CCryptoBoringSSL/crypto/hpke/hpke.c @@ -21,12 +21,15 @@ #include #include #include +#include #include #include #include +#include #include #include +#include "../fipsmodule/ec/internal.h" #include "../internal.h" @@ -111,7 +114,7 @@ static int hpke_labeled_expand(const EVP_MD *hkdf_md, uint8_t *out_key, const uint8_t *info, size_t info_len) { // labeledInfo = concat(I2OSP(L, 2), "HPKE-v1", suite_id, label, info) CBB labeled_info; - int ok = CBB_init(&labeled_info, 0) && + int ok = CBB_init(&labeled_info, 0) && // CBB_add_u16(&labeled_info, out_len) && add_label_string(&labeled_info, kHpkeVersionId) && CBB_add_bytes(&labeled_info, suite_id, suite_id_len) && @@ -309,6 +312,294 @@ const EVP_HPKE_KEM *EVP_hpke_x25519_hkdf_sha256(void) { return &kKEM; } +#define P256_PRIVATE_KEY_LEN 32 +#define P256_PUBLIC_KEY_LEN 65 +#define P256_PUBLIC_VALUE_LEN 65 +#define P256_SEED_LEN 32 +#define P256_SHARED_KEY_LEN 32 + +static int 
p256_public_from_private(uint8_t out_pub[P256_PUBLIC_VALUE_LEN], + const uint8_t priv[P256_PRIVATE_KEY_LEN]) { + const EC_GROUP *const group = EC_group_p256(); + const uint8_t kAllZeros[P256_PRIVATE_KEY_LEN] = {0}; + EC_SCALAR private_scalar; + EC_JACOBIAN public_point; + EC_AFFINE public_point_affine; + + if (CRYPTO_memcmp(kAllZeros, priv, sizeof(kAllZeros)) == 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!ec_scalar_from_bytes(group, &private_scalar, priv, + P256_PRIVATE_KEY_LEN) || + !ec_point_mul_scalar_base(group, &public_point, &private_scalar) || + !ec_jacobian_to_affine(group, &public_point_affine, &public_point)) { + return 0; + } + + size_t out_len_x, out_len_y; + out_pub[0] = POINT_CONVERSION_UNCOMPRESSED; + ec_felem_to_bytes(group, &out_pub[1], &out_len_x, &public_point_affine.X); + ec_felem_to_bytes(group, &out_pub[33], &out_len_y, &public_point_affine.Y); + return 1; +} + +static int p256_init_key(EVP_HPKE_KEY *key, const uint8_t *priv_key, + size_t priv_key_len) { + if (priv_key_len != P256_PRIVATE_KEY_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!p256_public_from_private(key->public_key, priv_key)) { + return 0; + } + + OPENSSL_memcpy(key->private_key, priv_key, priv_key_len); + return 1; +} + +static int p256_private_key_from_seed(uint8_t out_priv[P256_PRIVATE_KEY_LEN], + const uint8_t seed[P256_SEED_LEN]) { + // https://www.rfc-editor.org/rfc/rfc9180.html#name-derivekeypair + const uint8_t suite_id[5] = {'K', 'E', 'M', + EVP_HPKE_DHKEM_P256_HKDF_SHA256 >> 8, + EVP_HPKE_DHKEM_P256_HKDF_SHA256 & 0xff}; + + uint8_t dkp_prk[32]; + size_t dkp_prk_len; + if (!hpke_labeled_extract(EVP_sha256(), dkp_prk, &dkp_prk_len, NULL, 0, + suite_id, sizeof(suite_id), "dkp_prk", seed, + P256_SEED_LEN)) { + return 0; + } + assert(dkp_prk_len == sizeof(dkp_prk)); + + const EC_GROUP *const group = EC_group_p256(); + EC_SCALAR private_scalar; + + for (unsigned counter = 0; counter < 256; counter++) { + const uint8_t counter_byte = counter & 0xff; + if (!hpke_labeled_expand(EVP_sha256(), out_priv, P256_PRIVATE_KEY_LEN, + dkp_prk, sizeof(dkp_prk), suite_id, + sizeof(suite_id), "candidate", &counter_byte, + sizeof(counter_byte))) { + return 0; + } + + // This checks that the scalar is less than the order. + if (ec_scalar_from_bytes(group, &private_scalar, out_priv, + P256_PRIVATE_KEY_LEN)) { + return 1; + } + } + + // This happens with probability of 2^-(32*256). 
+ OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; +} + +static int p256_generate_key(EVP_HPKE_KEY *key) { + uint8_t seed[P256_SEED_LEN]; + RAND_bytes(seed, sizeof(seed)); + if (!p256_private_key_from_seed(key->private_key, seed) || + !p256_public_from_private(key->public_key, key->private_key)) { + return 0; + } + return 1; +} + +static int p256(uint8_t out_dh[P256_SHARED_KEY_LEN], + const uint8_t my_private[P256_PRIVATE_KEY_LEN], + const uint8_t their_public[P256_PUBLIC_VALUE_LEN]) { + const EC_GROUP *const group = EC_group_p256(); + EC_SCALAR private_scalar; + EC_FELEM x, y; + EC_JACOBIAN shared_point, their_point; + EC_AFFINE their_point_affine, shared_point_affine; + + if (their_public[0] != POINT_CONVERSION_UNCOMPRESSED || + !ec_felem_from_bytes(group, &x, &their_public[1], 32) || + !ec_felem_from_bytes(group, &y, &their_public[33], 32) || + !ec_point_set_affine_coordinates(group, &their_point_affine, &x, &y) || + !ec_scalar_from_bytes(group, &private_scalar, my_private, + P256_PRIVATE_KEY_LEN)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + ec_affine_to_jacobian(group, &their_point, &their_point_affine); + if (!ec_point_mul_scalar(group, &shared_point, &their_point, + &private_scalar) || + !ec_jacobian_to_affine(group, &shared_point_affine, &shared_point)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + size_t out_len; + ec_felem_to_bytes(group, out_dh, &out_len, &shared_point_affine.X); + assert(out_len == P256_SHARED_KEY_LEN); + return 1; +} + +static int p256_encap_with_seed(const EVP_HPKE_KEM *kem, + uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, + size_t *out_enc_len, size_t max_enc, + const uint8_t *peer_public_key, + size_t peer_public_key_len, const uint8_t *seed, + size_t seed_len) { + if (max_enc < P256_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (seed_len != P256_SEED_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + uint8_t private_key[P256_PRIVATE_KEY_LEN]; + if (!p256_private_key_from_seed(private_key, seed)) { + return 0; + } + p256_public_from_private(out_enc, private_key); + + uint8_t dh[P256_SHARED_KEY_LEN]; + if (peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[2 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, out_enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_enc_len = P256_PUBLIC_VALUE_LEN; + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len) { + uint8_t dh[P256_SHARED_KEY_LEN]; + if (enc_len != P256_PUBLIC_VALUE_LEN || // + !p256(dh, key->private_key, enc)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[2 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, 
sizeof(kem_context))) { + return 0; + } + + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_auth_encap_with_seed( + const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, size_t *out_enc_len, + size_t max_enc, const uint8_t *peer_public_key, size_t peer_public_key_len, + const uint8_t *seed, size_t seed_len) { + if (max_enc < P256_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (seed_len != P256_SEED_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + uint8_t private_key[P256_PRIVATE_KEY_LEN]; + if (!p256_private_key_from_seed(private_key, seed)) { + return 0; + } + p256_public_from_private(out_enc, private_key); + + uint8_t dh[2 * P256_SHARED_KEY_LEN]; + if (peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, private_key, peer_public_key) || + !p256(dh + P256_SHARED_KEY_LEN, key->private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[3 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, out_enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + 2 * P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_enc_len = P256_PUBLIC_VALUE_LEN; + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_auth_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len, const uint8_t *peer_public_key, + size_t peer_public_key_len) { + uint8_t dh[2 * P256_SHARED_KEY_LEN]; + if (enc_len != P256_PUBLIC_VALUE_LEN || + peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, key->private_key, enc) || + !p256(dh + P256_SHARED_KEY_LEN, key->private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[3 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + 2 * P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +const EVP_HPKE_KEM *EVP_hpke_p256_hkdf_sha256(void) { + static const EVP_HPKE_KEM kKEM = { + /*id=*/EVP_HPKE_DHKEM_P256_HKDF_SHA256, + /*public_key_len=*/P256_PUBLIC_KEY_LEN, + /*private_key_len=*/P256_PRIVATE_KEY_LEN, + /*seed_len=*/P256_SEED_LEN, + /*enc_len=*/P256_PUBLIC_VALUE_LEN, + p256_init_key, + p256_generate_key, + p256_encap_with_seed, + p256_decap, + p256_auth_encap_with_seed, + p256_auth_decap, + }; + return &kKEM; +} + uint16_t EVP_HPKE_KEM_id(const EVP_HPKE_KEM *kem) { return kem->id; } size_t EVP_HPKE_KEM_public_key_len(const EVP_HPKE_KEM *kem) { @@ -398,7 +689,7 @@ int EVP_HPKE_KEY_public_key(const EVP_HPKE_KEY *key, uint8_t *out, } int EVP_HPKE_KEY_private_key(const EVP_HPKE_KEY *key, uint8_t *out, - size_t *out_len, size_t max_out) { + size_t *out_len, size_t max_out) { if (max_out < 
key->kem->private_key_len) { OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); return 0; diff --git a/Sources/CCryptoBoringSSL/crypto/internal.h b/Sources/CCryptoBoringSSL/crypto/internal.h index c1882f26..a742ef6b 100644 --- a/Sources/CCryptoBoringSSL/crypto/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/internal.h @@ -180,17 +180,29 @@ extern "C" { #endif -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM) || \ - defined(OPENSSL_AARCH64) -// OPENSSL_cpuid_setup initializes the platform-specific feature cache. +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_STATIC_ARMCAP) && \ + (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ + defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) +// x86, x86_64, and the ARMs need to record the result of a cpuid/getauxval call +// for the asm to work correctly, unless compiled without asm code. +#define NEED_CPUID + +// OPENSSL_cpuid_setup initializes the platform-specific feature cache. This +// function should not be called directly. Call |OPENSSL_init_cpuid| instead. void OPENSSL_cpuid_setup(void); + +// OPENSSL_init_cpuid initializes the platform-specific feature cache, if +// needed. This function is idempotent and may be called concurrently. +void OPENSSL_init_cpuid(void); +#else +OPENSSL_INLINE void OPENSSL_init_cpuid(void) {} #endif #if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_STATIC_ARMCAP) // OPENSSL_get_armcap_pointer_for_test returns a pointer to |OPENSSL_armcap_P| -// for unit tests. Any modifications to the value must be made after -// |CRYPTO_library_init| but before any other function call in BoringSSL. +// for unit tests. Any modifications to the value must be made before any other +// function call in BoringSSL. OPENSSL_EXPORT uint32_t *OPENSSL_get_armcap_pointer_for_test(void); #endif @@ -205,7 +217,9 @@ typedef __uint128_t uint128_t; // __uint128_t division depends on intrinsics in the compiler runtime. Those // intrinsics are missing in clang-cl (https://crbug.com/787617) and nanolibc. // These may be bugs in the toolchain definition, but just disable it for now. -#if !defined(_MSC_VER) && !defined(OPENSSL_NANOLIBC) +// EDK2's toolchain is missing __udivti3 (b/339380897) so cannot support +// 128-bit division currently. +#if !defined(_MSC_VER) && !defined(OPENSSL_NANOLIBC) && !defined(__EDK2_BORINGSSL__) #define BORINGSSL_CAN_DIVIDE_UINT128 #endif #endif @@ -260,9 +274,9 @@ typedef __uint128_t uint128_t; #endif #if defined(__GNUC__) || defined(__clang__) -#define OPENSSL_ATTR_PURE __attribute__((pure)) +#define OPENSSL_ATTR_CONST __attribute__((const)) #else -#define OPENSSL_ATTR_PURE +#define OPENSSL_ATTR_CONST #endif #if defined(BORINGSSL_MALLOC_FAILURE_TESTING) @@ -555,11 +569,22 @@ static inline void constant_time_conditional_memcpy(void *dst, const void *src, // |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory // ranges at |dst| and |src| must not overlap, as when calling |memcpy|. 
static inline void constant_time_conditional_memxor(void *dst, const void *src, - const size_t n, + size_t n, const crypto_word_t mask) { assert(!buffers_alias(dst, n, src, n)); uint8_t *out = (uint8_t *)dst; const uint8_t *in = (const uint8_t *)src; +#if defined(__GNUC__) && !defined(__clang__) + // gcc 13.2.0 doesn't automatically vectorize this loop regardless of barrier + typedef uint8_t v32u8 __attribute__((vector_size(32), aligned(1), may_alias)); + size_t n_vec = n&~(size_t)31; + v32u8 masks = ((uint8_t)mask-(v32u8){}); // broadcast + for (size_t i = 0; i < n_vec; i += 32) { + *(v32u8*)&out[i] ^= masks & *(v32u8*)&in[i]; + } + out += n_vec; + n -= n_vec; +#endif for (size_t i = 0; i < n; i++) { out[i] ^= value_barrier_w(mask) & in[i]; } @@ -609,6 +634,12 @@ static inline int constant_time_declassify_int(int v) { return value_barrier_u32(v); } +// declassify_assert behaves like |assert| but declassifies the result of +// evaluating |expr|. This allows the assertion to branch on the (presumably +// public) result, but still ensures that values leading up to the computation +// were secret. +#define declassify_assert(expr) assert(constant_time_declassify_int(expr)) + // Thread-safe initialisation. @@ -1168,6 +1199,11 @@ static inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) { // Arithmetic functions. +// The most efficient versions of these functions on GCC and Clang depend on C11 +// |_Generic|. If we ever need to call these from C++, we'll need to add a +// variant that uses C++ overloads instead. +#if !defined(__cplusplus) + // CRYPTO_addc_* returns |x + y + carry|, and sets |*out_carry| to the carry // bit. |carry| must be zero or one. #if OPENSSL_HAS_BUILTIN(__builtin_addc) @@ -1180,13 +1216,13 @@ static inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) { static inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry, uint32_t *out_carry) { - assert(carry <= 1); + declassify_assert(carry <= 1); return CRYPTO_GENERIC_ADDC(x, y, carry, out_carry); } static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, uint64_t *out_carry) { - assert(carry <= 1); + declassify_assert(carry <= 1); return CRYPTO_GENERIC_ADDC(x, y, carry, out_carry); } @@ -1194,7 +1230,7 @@ static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, static inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry, uint32_t *out_carry) { - assert(carry <= 1); + declassify_assert(carry <= 1); uint64_t ret = carry; ret += (uint64_t)x + y; *out_carry = (uint32_t)(ret >> 32); @@ -1203,7 +1239,7 @@ static inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry, static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, uint64_t *out_carry) { - assert(carry <= 1); + declassify_assert(carry <= 1); #if defined(BORINGSSL_HAS_UINT128) uint128_t ret = carry; ret += (uint128_t)x + y; @@ -1232,13 +1268,13 @@ static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, static inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow, uint32_t *out_borrow) { - assert(borrow <= 1); + declassify_assert(borrow <= 1); return CRYPTO_GENERIC_SUBC(x, y, borrow, out_borrow); } static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow, uint64_t *out_borrow) { - assert(borrow <= 1); + declassify_assert(borrow <= 1); return CRYPTO_GENERIC_SUBC(x, y, borrow, out_borrow); } @@ -1246,7 +1282,7 @@ static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t 
borrow, static inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow, uint32_t *out_borrow) { - assert(borrow <= 1); + declassify_assert(borrow <= 1); uint32_t ret = x - y - borrow; *out_borrow = (x < y) | ((x == y) & borrow); return ret; @@ -1254,7 +1290,7 @@ static inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow, static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow, uint64_t *out_borrow) { - assert(borrow <= 1); + declassify_assert(borrow <= 1); uint64_t ret = x - y - borrow; *out_borrow = (x < y) | ((x == y) & borrow); return ret; @@ -1269,6 +1305,8 @@ static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow, #define CRYPTO_subc_w CRYPTO_subc_u32 #endif +#endif // !__cplusplus + // FIPS functions. @@ -1352,21 +1390,23 @@ OPENSSL_INLINE int boringssl_fips_break_test(const char *test) { // ECX for CPUID where EAX = 1 // Bit 11 is used to indicate AMD XOP support, not SDBG // Index 2: -// EBX for CPUID where EAX = 7 +// EBX for CPUID where EAX = 7, ECX = 0 +// Bit 14 (for removed feature MPX) is used to indicate a preference for ymm +// registers over zmm even when zmm registers are supported // Index 3: -// ECX for CPUID where EAX = 7 +// ECX for CPUID where EAX = 7, ECX = 0 // -// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM -// bits in XCR0, so it is not necessary to check those. (WARNING: See caveats -// in cpu_intel.c.) +// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the XMM, YMM, +// and AVX512 bits in XCR0, so it is not necessary to check those. (WARNING: See +// caveats in cpu_intel.c.) // // From C, this symbol should only be accessed with |OPENSSL_get_ia32cap|. extern uint32_t OPENSSL_ia32cap_P[4]; // OPENSSL_get_ia32cap initializes the library if needed and returns the |idx|th -// entry of |OPENSSL_ia32cap_P|. It is marked as a pure function so duplicate +// entry of |OPENSSL_ia32cap_P|. It is marked as a const function so duplicate // calls can be merged by the compiler, at least when indices match. -OPENSSL_ATTR_PURE uint32_t OPENSSL_get_ia32cap(int idx); +OPENSSL_ATTR_CONST uint32_t OPENSSL_get_ia32cap(int idx); // See Intel manual, volume 2A, table 3-11. @@ -1508,7 +1548,6 @@ OPENSSL_INLINE int CRYPTO_is_x86_SHA_capable(void) { // otherwise select. See chacha-x86_64.pl. // // Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by -// this. |OPENSSL_cpuid_setup| forces Knights Landing to also be matched by // this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it // isn't matched by this. Various sources indicate AMD first implemented MOVBE // and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be @@ -1517,15 +1556,56 @@ OPENSSL_INLINE int CRYPTO_cpu_perf_is_like_silvermont(void) { // WARNING: This MUST NOT be used to guard the execution of the XSAVE // instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit // that indicates whether we can safely execute XSAVE. This bit may be set - // even when XSAVE is disabled (by the operating system). See the comment in - // cpu_intel.c and check how the users of this bit use it. + // even when XSAVE is disabled (by the operating system). See how the users of + // this bit use it. // - // We do not use |__XSAVE__| for static detection because the hack in - // |OPENSSL_cpuid_setup| for Knights Landing CPUs needs to override it. 
+ // Historically, the XSAVE bit was artificially cleared on Knights Landing + // and Knights Mill chips, but as Intel has removed all support from GCC, + // LLVM, and SDE, we assume they are no longer worth special-casing. int hardware_supports_xsave = (OPENSSL_get_ia32cap(1) & (1u << 26)) != 0; return !hardware_supports_xsave && CRYPTO_is_MOVBE_capable(); } +OPENSSL_INLINE int CRYPTO_is_AVX512BW_capable(void) { +#if defined(__AVX512BW__) + return 1; +#else + return (OPENSSL_get_ia32cap(2) & (1u << 30)) != 0; +#endif +} + +OPENSSL_INLINE int CRYPTO_is_AVX512VL_capable(void) { +#if defined(__AVX512VL__) + return 1; +#else + return (OPENSSL_get_ia32cap(2) & (1u << 31)) != 0; +#endif +} + +// CRYPTO_cpu_avoid_zmm_registers returns 1 if zmm registers (512-bit vectors) +// should not be used even if the CPU supports them. +// +// Note that this reuses the bit for the removed MPX feature. +OPENSSL_INLINE int CRYPTO_cpu_avoid_zmm_registers(void) { + return (OPENSSL_get_ia32cap(2) & (1u << 14)) != 0; +} + +OPENSSL_INLINE int CRYPTO_is_VAES_capable(void) { +#if defined(__VAES__) + return 1; +#else + return (OPENSSL_get_ia32cap(3) & (1u << 9)) != 0; +#endif +} + +OPENSSL_INLINE int CRYPTO_is_VPCLMULQDQ_capable(void) { +#if defined(__VPCLMULQDQ__) + return 1; +#else + return (OPENSSL_get_ia32cap(3) & (1u << 10)) != 0; +#endif +} + #endif // OPENSSL_X86 || OPENSSL_X86_64 #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) @@ -1535,9 +1615,9 @@ OPENSSL_INLINE int CRYPTO_cpu_perf_is_like_silvermont(void) { extern uint32_t OPENSSL_armcap_P; // OPENSSL_get_armcap initializes the library if needed and returns ARM CPU -// capabilities. It is marked as a pure function so duplicate calls can be -// merged by the compiler, at least when indices match. -OPENSSL_ATTR_PURE uint32_t OPENSSL_get_armcap(void); +// capabilities. It is marked as a const function so duplicate calls can be +// merged by the compiler. +OPENSSL_ATTR_CONST uint32_t OPENSSL_get_armcap(void); // We do not detect any features at runtime on several 32-bit Arm platforms. // Apple platforms and OpenBSD require NEON and moved to 64-bit to pick up Armv8 diff --git a/Sources/CCryptoBoringSSL/crypto/kyber/kyber.c b/Sources/CCryptoBoringSSL/crypto/kyber/kyber.c index c84c4ed8..bd860118 100644 --- a/Sources/CCryptoBoringSSL/crypto/kyber/kyber.c +++ b/Sources/CCryptoBoringSSL/crypto/kyber/kyber.c @@ -12,6 +12,7 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ +#define OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER #include #include diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md4/md4.c b/Sources/CCryptoBoringSSL/crypto/md4/md4.c similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md4/md4.c rename to Sources/CCryptoBoringSSL/crypto/md4/md4.c index 808c4667..2782e991 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md4/md4.c +++ b/Sources/CCryptoBoringSSL/crypto/md4/md4.c @@ -59,8 +59,8 @@ #include #include -#include "../../internal.h" -#include "../digest/md32_common.h" +#include "../internal.h" +#include "../crypto/fipsmodule/digest/md32_common.h" uint8_t *MD4(const uint8_t *data, size_t len, uint8_t out[MD4_DIGEST_LENGTH]) { @@ -231,10 +231,3 @@ void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num) { D = state[3] += D; } } - -#undef F -#undef G -#undef H -#undef R0 -#undef R1 -#undef R2 diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/internal.h b/Sources/CCryptoBoringSSL/crypto/md5/internal.h similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/internal.h rename to Sources/CCryptoBoringSSL/crypto/md5/internal.h diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/md5.c b/Sources/CCryptoBoringSSL/crypto/md5/md5.c similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/md5.c rename to Sources/CCryptoBoringSSL/crypto/md5/md5.c index 5394eb8f..fccbd6eb 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/md5.c +++ b/Sources/CCryptoBoringSSL/crypto/md5/md5.c @@ -60,8 +60,8 @@ #include -#include "../../internal.h" -#include "../digest/md32_common.h" +#include "../internal.h" +#include "../fipsmodule/digest/md32_common.h" #include "internal.h" diff --git a/Sources/CCryptoBoringSSL/crypto/mem.c b/Sources/CCryptoBoringSSL/crypto/mem.c index 5b6298c1..446b9d2d 100644 --- a/Sources/CCryptoBoringSSL/crypto/mem.c +++ b/Sources/CCryptoBoringSSL/crypto/mem.c @@ -94,7 +94,11 @@ static void __asan_unpoison_memory_region(const void *addr, size_t size) {} // Windows doesn't really support weak symbols as of May 2019, and Clang on // Windows will emit strong symbols instead. See // https://bugs.llvm.org/show_bug.cgi?id=37598 -#if defined(__ELF__) && defined(__GNUC__) +// +// EDK2 targets UEFI but builds as ELF and then translates the binary to +// COFF(!). Thus it builds with __ELF__ defined but cannot actually cope with +// weak symbols. +#if !defined(__EDK2_BORINGSSL__) && defined(__ELF__) && defined(__GNUC__) #define WEAK_SYMBOL_FUNC(rettype, name, args) \ rettype name args __attribute__((weak)); #else @@ -242,7 +246,7 @@ void *OPENSSL_malloc(size_t size) { __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); return ((uint8_t *)ptr) + OPENSSL_MALLOC_PREFIX; - err: +err: // This only works because ERR does not call OPENSSL_malloc. OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); return NULL; @@ -398,13 +402,8 @@ char *OPENSSL_strdup(const char *s) { if (s == NULL) { return NULL; } - const size_t len = strlen(s) + 1; - char *ret = OPENSSL_malloc(len); - if (ret == NULL) { - return NULL; - } - OPENSSL_memcpy(ret, s, len); - return ret; + // Copy the NUL terminator. 
+ return OPENSSL_memdup(s, strlen(s) + 1); } int OPENSSL_isalpha(int c) { @@ -528,7 +527,7 @@ int OPENSSL_vasprintf_internal(char **str, const char *format, va_list args, *str = candidate; return ret; - err: +err: deallocate(candidate); *str = NULL; errno = ENOMEM; diff --git a/Sources/CCryptoBoringSSL/crypto/mldsa/internal.h b/Sources/CCryptoBoringSSL/crypto/mldsa/internal.h new file mode 100644 index 00000000..dca55274 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/mldsa/internal.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2024, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H + +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + + +// MLDSA_SIGNATURE_RANDOMIZER_BYTES is the number of bytes of uniformly +// random entropy necessary to generate a signature in randomized mode. +#define MLDSA_SIGNATURE_RANDOMIZER_BYTES 32 + +// MLDSA65_generate_key_external_entropy generates a public/private key pair +// using the given seed, writes the encoded public key to +// |out_encoded_public_key| and sets |out_private_key| to the private key. +// It returns 1 on success and 0 on failure. +OPENSSL_EXPORT int MLDSA65_generate_key_external_entropy( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + struct MLDSA65_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]); + +// MLDSA65_sign_internal signs |msg| using |private_key| and writes the +// signature to |out_encoded_signature|. The |context_prefix| and |context| are +// prefixed to the message, in that order, before signing. The |randomizer| +// value can be set to zero bytes in order to make a deterministic signature, or +// else filled with entropy for the usual |MLDSA_sign| behavior. It returns 1 on +// success and 0 on error. +OPENSSL_EXPORT int MLDSA65_sign_internal( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]); + +// MLDSA65_verify_internal verifies that |encoded_signature| is a valid +// signature of |msg| by |public_key|. The |context_prefix| and |context| are +// prefixed to the message before verification, in that order. It returns 1 on +// success and 0 on error. 
+OPENSSL_EXPORT int MLDSA65_verify_internal( + const struct MLDSA65_public_key *public_key, + const uint8_t encoded_signature[MLDSA65_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix, + size_t context_prefix_len, const uint8_t *context, size_t context_len); + +// MLDSA65_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-DSA-65 private keys. It returns 1 on success or 0 +// on allocation error. +OPENSSL_EXPORT int MLDSA65_marshal_private_key( + CBB *out, const struct MLDSA65_private_key *private_key); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H diff --git a/Sources/CCryptoBoringSSL/crypto/mldsa/mldsa.c b/Sources/CCryptoBoringSSL/crypto/mldsa/mldsa.c new file mode 100644 index 00000000..8454debd --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/mldsa/mldsa.c @@ -0,0 +1,1687 @@ +/* Copyright (c) 2024, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include + +#include +#include + +#include +#include +#include + +#include "../internal.h" +#include "../keccak/internal.h" +#include "./internal.h" + +#define DEGREE 256 +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define OMEGA 55 + +#define RHO_BYTES 32 +#define SIGMA_BYTES 64 +#define K_BYTES 32 +#define TR_BYTES 64 +#define MU_BYTES 64 +#define RHO_PRIME_BYTES 64 +#define LAMBDA_BITS 192 +#define LAMBDA_BYTES (LAMBDA_BITS / 8) + +// 2^23 - 2^13 + 1 +static const uint32_t kPrime = 8380417; +// Inverse of -kPrime modulo 2^32 +static const uint32_t kPrimeNegInverse = 4236238847; +static const int kDroppedBits = 13; +static const uint32_t kHalfPrime = (8380417 - 1) / 2; +static const uint32_t kGamma1 = 1 << 19; +static const uint32_t kGamma2 = (8380417 - 1) / 32; +// 256^-1 mod kPrime, in Montgomery form. 
+static const uint32_t kInverseDegreeMontgomery = 41978; + +typedef struct scalar { + uint32_t c[DEGREE]; +} scalar; + +typedef struct vectork { + scalar v[K]; +} vectork; + +typedef struct vectorl { + scalar v[L]; +} vectorl; + +typedef struct matrix { + scalar v[K][L]; +} matrix; + +/* Arithmetic */ + +// This bit of Python will be referenced in some of the following comments: +// +// q = 8380417 +// # Inverse of -q modulo 2^32 +// q_neg_inverse = 4236238847 +// # 2^64 modulo q +// montgomery_square = 2365951 +// +// def bitreverse(i): +// ret = 0 +// for n in range(8): +// bit = i & 1 +// ret <<= 1 +// ret |= bit +// i >>= 1 +// return ret +// +// def montgomery_reduce(x): +// a = (x * q_neg_inverse) % 2**32 +// b = x + a * q +// assert b & 0xFFFF_FFFF == 0 +// c = b >> 32 +// assert c < q +// return c +// +// def montgomery_transform(x): +// return montgomery_reduce(x * montgomery_square) + +// kNTTRootsMontgomery = [ +// montgomery_transform(pow(1753, bitreverse(i), q)) for i in range(256) +// ] +static const uint32_t kNTTRootsMontgomery[256] = { + 4193792, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, + 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, + 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, + 6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, + 4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, + 6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, + 811944, 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, + 4450022, 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, + 7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, + 3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, + 7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, + 5037034, 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, + 4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969, 7063561, + 189548, 4827145, 3159746, 6529015, 5971092, 8202977, 1315589, 1341330, + 1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961, + 2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955, + 266997, 2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039, + 900702, 1859098, 909542, 819034, 495491, 6767243, 8337157, 7857917, + 7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579, + 342297, 286988, 5942594, 4108315, 3437287, 5038140, 1735879, 203044, + 2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974, + 4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447, + 7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775, + 7100756, 1917081, 5834105, 7005614, 1500165, 777191, 2235880, 3406031, + 7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136, 4603424, + 6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531, 7173032, + 5196991, 162844, 1616392, 3014001, 810149, 1652634, 4686184, 6581310, + 5341501, 3523897, 3866901, 269760, 2213111, 7404533, 1717735, 472078, + 7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524, + 5441381, 6144432, 7959518, 6094090, 183443, 7403526, 1612842, 4834730, + 7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782}; + +// Reduces x mod kPrime in constant time, where 0 <= x < 2*kPrime. 
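Before the constant-time helpers that follow, the reduction described by the Python comment above can be restated as a plain, variable-time C reference. This is an illustrative sketch only (the helper name is made up); it mirrors |reduce_montgomery| further down and uses the same constants.

#include <assert.h>
#include <stdint.h>

#define Q 8380417u             // kPrime
#define Q_NEG_INV 4236238847u  // -Q^-1 mod 2^32

// Returns x * 2^-32 mod Q for 0 <= x <= Q * 2^32.
static uint32_t montgomery_reduce_ref(uint64_t x) {
  uint32_t a = (uint32_t)x * Q_NEG_INV;  // a = x * (-Q^-1) mod 2^32
  uint64_t b = x + (uint64_t)a * Q;      // the low 32 bits of b are now zero
  assert((b & 0xffffffffu) == 0);
  uint32_t c = (uint32_t)(b >> 32);      // 0 <= c < 2*Q
  return c < Q ? c : c - Q;              // one conditional subtraction
}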
+static uint32_t reduce_once(uint32_t x) { + declassify_assert(x < 2 * kPrime); + // return x < kPrime ? x : x - kPrime; + return constant_time_select_int(constant_time_lt_w(x, kPrime), x, x - kPrime); +} + +// Returns the absolute value in constant time. +static uint32_t abs_signed(uint32_t x) { + // return is_positive(x) ? x : -x; + // Note: MSVC doesn't like applying the unary minus operator to unsigned types + // (warning C4146), so we write the negation as a bitwise not plus one + // (assuming two's complement representation). + return constant_time_select_int(constant_time_lt_w(x, 0x80000000), x, 0u - x); +} + +// Returns the absolute value modulo kPrime. +static uint32_t abs_mod_prime(uint32_t x) { + declassify_assert(x < kPrime); + // return x > kHalfPrime ? kPrime - x : x; + return constant_time_select_int(constant_time_lt_w(kHalfPrime, x), kPrime - x, + x); +} + +// Returns the maximum of two values in constant time. +static uint32_t maximum(uint32_t x, uint32_t y) { + // return x < y ? y : x; + return constant_time_select_int(constant_time_lt_w(x, y), y, x); +} + +static uint32_t mod_sub(uint32_t a, uint32_t b) { + declassify_assert(a < kPrime); + declassify_assert(b < kPrime); + return reduce_once(kPrime + a - b); +} + +static void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_once(lhs->c[i] + rhs->c[i]); + } +} + +static void scalar_sub(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = mod_sub(lhs->c[i], rhs->c[i]); + } +} + +static uint32_t reduce_montgomery(uint64_t x) { + declassify_assert(x <= ((uint64_t)kPrime << 32)); + uint64_t a = (uint32_t)x * kPrimeNegInverse; + uint64_t b = x + a * kPrime; + declassify_assert((b & 0xffffffff) == 0); + uint32_t c = b >> 32; + return reduce_once(c); +} + +// Multiply two scalars in the number theoretically transformed state. +static void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_montgomery((uint64_t)lhs->c[i] * (uint64_t)rhs->c[i]); + } +} + +// In place number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 41 (`NTT`). +static void scalar_ntt(scalar *s) { + // Step: 1, 2, 4, 8, ..., 128 + // Offset: 128, 64, 32, 16, ..., 1 + int offset = DEGREE; + for (int step = 1; step < DEGREE; step <<= 1) { + offset >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = kNTTRootsMontgomery[step + i]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime. + // |step_root| < kPrime because it's static data. |s->c[...]| is < + // kPrime by the invariants of that struct. + uint32_t odd = + reduce_montgomery((uint64_t)step_root * (uint64_t)s->c[j + offset]); + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = mod_sub(even, odd); + } + k += 2 * offset; + } + } +} + +// In place inverse number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 42 (`NTT^-1`). 
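The forward transform above and the inverse transform that follows exist to turn multiplication in Z_q[X]/(X^256 + 1) into coefficient-wise multiplication. Note that |kInverseDegreeMontgomery| is 256^-1 * 2^64 mod q, so the final scaling of the inverse transform also absorbs the 2^-32 factor that |scalar_mult| leaves behind. A rough composition sketch (hypothetical helper, reusing the static functions from this file):

static void scalar_inverse_ntt(scalar *s);  // defined just below

// Multiplies two polynomials by transforming, multiplying pointwise and
// transforming back; the Montgomery factor from scalar_mult is canceled by
// the inverse transform's final scaling, so |out| is exactly a*b in the ring.
static void poly_mult_sketch(scalar *out, const scalar *a, const scalar *b) {
  scalar a_ntt = *a, b_ntt = *b;
  scalar_ntt(&a_ntt);
  scalar_ntt(&b_ntt);
  scalar_mult(out, &a_ntt, &b_ntt);
  scalar_inverse_ntt(out);
}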
+static void scalar_inverse_ntt(scalar *s) { + // Step: 128, 64, 32, 16, ..., 1 + // Offset: 1, 2, 4, 8, ..., 128 + int step = DEGREE; + for (int offset = 1; offset < DEGREE; offset <<= 1) { + step >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = + kPrime - kNTTRootsMontgomery[step + (step - 1 - i)]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + uint32_t odd = s->c[j + offset]; + s->c[j] = reduce_once(odd + even); + + // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime. + // kPrime + even < 2*kPrime because |even| < kPrime, by the invariants + // of that structure. Thus kPrime + even - odd < 2*kPrime because odd >= + // 0, because it's unsigned and less than kPrime. Lastly step_root < + // kPrime, because |kNTTRootsMontgomery| is static data. + s->c[j + offset] = reduce_montgomery((uint64_t)step_root * + (uint64_t)(kPrime + even - odd)); + } + k += 2 * offset; + } + } + for (int i = 0; i < DEGREE; i++) { + s->c[i] = reduce_montgomery((uint64_t)s->c[i] * + (uint64_t)kInverseDegreeMontgomery); + } +} + +static void vectork_zero(vectork *out) { OPENSSL_memset(out, 0, sizeof(*out)); } + +static void vectork_add(vectork *out, const vectork *lhs, const vectork *rhs) { + for (int i = 0; i < K; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectork_sub(vectork *out, const vectork *lhs, const vectork *rhs) { + for (int i = 0; i < K; i++) { + scalar_sub(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectork_mult_scalar(vectork *out, const vectork *lhs, + const scalar *rhs) { + for (int i = 0; i < K; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +static void vectork_ntt(vectork *a) { + for (int i = 0; i < K; i++) { + scalar_ntt(&a->v[i]); + } +} + +static void vectork_inverse_ntt(vectork *a) { + for (int i = 0; i < K; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void vectorl_add(vectorl *out, const vectorl *lhs, const vectorl *rhs) { + for (int i = 0; i < L; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectorl_mult_scalar(vectorl *out, const vectorl *lhs, + const scalar *rhs) { + for (int i = 0; i < L; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +static void vectorl_ntt(vectorl *a) { + for (int i = 0; i < L; i++) { + scalar_ntt(&a->v[i]); + } +} + +static void vectorl_inverse_ntt(vectorl *a) { + for (int i = 0; i < L; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void matrix_mult(vectork *out, const matrix *m, const vectorl *a) { + vectork_zero(out); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + scalar product; + scalar_mult(&product, &m->v[i][j], &a->v[j]); + scalar_add(&out->v[i], &out->v[i], &product); + } + } +} + +/* Rounding & hints */ + +// FIPS 204, Algorithm 35 (`Power2Round`). +static void power2_round(uint32_t *r1, uint32_t *r0, uint32_t r) { + *r1 = r >> kDroppedBits; + *r0 = r - (*r1 << kDroppedBits); + + uint32_t r0_adjusted = mod_sub(*r0, 1 << kDroppedBits); + uint32_t r1_adjusted = *r1 + 1; + + // Mask is set iff r0 > 2^(dropped_bits - 1). + crypto_word_t mask = + constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0); + // r0 = mask ? r0_adjusted : r0 + *r0 = constant_time_select_int(mask, r0_adjusted, *r0); + // r1 = mask ? r1_adjusted : r1 + *r1 = constant_time_select_int(mask, r1_adjusted, *r1); +} + +// Scale back previously rounded value. 
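|power2_round| above splits each coefficient r into a high part r1 and a low part r0 (stored mod q, representing a value in (-2^12, 2^12]) with r = r1 * 2^13 + r0 mod q; the scaling helper that follows is its partial inverse, applied to the public t1 during verification. A small recombination check, as an illustrative sketch using the helpers already defined:

// For any r in [0, kPrime), recombining the two halves gives back r.
// Example: r = 5000 yields r1 = 1 and r0 = kPrime - 3192 (i.e. -3192),
// and 1 * 8192 - 3192 == 5000.
static int power2_round_roundtrips(uint32_t r) {
  uint32_t r1, r0;
  power2_round(&r1, &r0, r);
  // r1 <= 1023, so (r1 << 13) + r0 < 2 * kPrime and reduce_once applies.
  return reduce_once((r1 << kDroppedBits) + r0) == r;
}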
+static void scale_power2_round(uint32_t *out, uint32_t r1) { + // Pre-condition: 0 <= r1 <= 2^10 - 1 + assert(r1 < (1u << 10)); + + *out = r1 << kDroppedBits; + + // Post-condition: 0 <= out <= 2^23 - 2^13 = kPrime - 1 + assert(*out < kPrime); +} + +// FIPS 204, Algorithm 37 (`HighBits`). +static uint32_t high_bits(uint32_t x) { + // Reference description (given 0 <= x < q): + // + // ``` + // int32_t r0 = x mod+- (2 * kGamma2); + // if (x - r0 == q - 1) { + // return 0; + // } else { + // return (x - r0) / (2 * kGamma2); + // } + // ``` + // + // Below is the formula taken from the reference implementation. + // + // Here, kGamma2 == 2^18 - 2^8 + // This returns ((ceil(x / 2^7) * (2^10 + 1) + 2^21) / 2^22) mod 2^4 + uint32_t r1 = (x + 127) >> 7; + r1 = (r1 * 1025 + (1 << 21)) >> 22; + r1 &= 15; + return r1; +} + +// FIPS 204, Algorithm 36 (`Decompose`). +static void decompose(uint32_t *r1, int32_t *r0, uint32_t r) { + *r1 = high_bits(r); + + *r0 = r; + *r0 -= *r1 * 2 * (int32_t)kGamma2; + *r0 -= (((int32_t)kHalfPrime - *r0) >> 31) & (int32_t)kPrime; +} + +// FIPS 204, Algorithm 38 (`LowBits`). +static int32_t low_bits(uint32_t x) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, x); + return r0; +} + +// FIPS 204, Algorithm 39 (`MakeHint`). +// +// In the spec this takes two arguments, z and r, and is called with +// z = -ct0 +// r = w - cs2 + ct0 +// +// It then computes HighBits (algorithm 37) of z and z+r. But z+r is just w - +// cs2, so this takes three arguments and saves an addition. +static int32_t make_hint(uint32_t ct0, uint32_t cs2, uint32_t w) { + uint32_t r_plus_z = mod_sub(w, cs2); + uint32_t r = reduce_once(r_plus_z + ct0); + return high_bits(r) != high_bits(r_plus_z); +} + +// FIPS 204, Algorithm 40 (`UseHint`). +static uint32_t use_hint_vartime(uint32_t h, uint32_t r) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, r); + + if (h) { + if (r0 > 0) { + // m = 16, thus |mod m| in the spec turns into |& 15|. 
+ return (r1 + 1) & 15; + } else { + return (r1 - 1) & 15; + } + } + return r1; +} + +static void scalar_power2_round(scalar *s1, scalar *s0, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + power2_round(&s1->c[i], &s0->c[i], s->c[i]); + } +} + +static void scalar_scale_power2_round(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + scale_power2_round(&out->c[i], in->c[i]); + } +} + +static void scalar_high_bits(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = high_bits(in->c[i]); + } +} + +static void scalar_low_bits(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = low_bits(in->c[i]); + } +} + +static void scalar_max(uint32_t *max, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + uint32_t abs = abs_mod_prime(s->c[i]); + *max = maximum(*max, abs); + } +} + +static void scalar_max_signed(uint32_t *max, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + uint32_t abs = abs_signed(s->c[i]); + *max = maximum(*max, abs); + } +} + +static void scalar_make_hint(scalar *out, const scalar *ct0, const scalar *cs2, + const scalar *w) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = make_hint(ct0->c[i], cs2->c[i], w->c[i]); + } +} + +static void scalar_use_hint_vartime(scalar *out, const scalar *h, + const scalar *r) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = use_hint_vartime(h->c[i], r->c[i]); + } +} + +static void vectork_power2_round(vectork *t1, vectork *t0, const vectork *t) { + for (int i = 0; i < K; i++) { + scalar_power2_round(&t1->v[i], &t0->v[i], &t->v[i]); + } +} + +static void vectork_scale_power2_round(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_scale_power2_round(&out->v[i], &in->v[i]); + } +} + +static void vectork_high_bits(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_high_bits(&out->v[i], &in->v[i]); + } +} + +static void vectork_low_bits(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_low_bits(&out->v[i], &in->v[i]); + } +} + +static uint32_t vectork_max(const vectork *a) { + uint32_t max = 0; + for (int i = 0; i < K; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +static uint32_t vectork_max_signed(const vectork *a) { + uint32_t max = 0; + for (int i = 0; i < K; i++) { + scalar_max_signed(&max, &a->v[i]); + } + return max; +} + +// The input vector contains only zeroes and ones. +static size_t vectork_count_ones(const vectork *a) { + size_t count = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < DEGREE; j++) { + count += a->v[i].c[j]; + } + } + return count; +} + +static void vectork_make_hint(vectork *out, const vectork *ct0, + const vectork *cs2, const vectork *w) { + for (int i = 0; i < K; i++) { + scalar_make_hint(&out->v[i], &ct0->v[i], &cs2->v[i], &w->v[i]); + } +} + +static void vectork_use_hint_vartime(vectork *out, const vectork *h, + const vectork *r) { + for (int i = 0; i < K; i++) { + scalar_use_hint_vartime(&out->v[i], &h->v[i], &r->v[i]); + } +} + +static uint32_t vectorl_max(const vectorl *a) { + uint32_t max = 0; + for (int i = 0; i < L; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +/* Bit packing */ + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 4. +static void scalar_encode_4(uint8_t out[128], const scalar *s) { + // Every two elements lands on a byte boundary. 
+ static_assert(DEGREE % 2 == 0, "DEGREE must be a multiple of 2"); + for (int i = 0; i < DEGREE / 2; i++) { + uint32_t a = s->c[2 * i]; + uint32_t b = s->c[2 * i + 1]; + declassify_assert(a < 16); + declassify_assert(b < 16); + out[i] = a | (b << 4); + } +} + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 10. +static void scalar_encode_10(uint8_t out[320], const scalar *s) { + // Every four elements lands on a byte boundary. + static_assert(DEGREE % 4 == 0, "DEGREE must be a multiple of 4"); + for (int i = 0; i < DEGREE / 4; i++) { + uint32_t a = s->c[4 * i]; + uint32_t b = s->c[4 * i + 1]; + uint32_t c = s->c[4 * i + 2]; + uint32_t d = s->c[4 * i + 3]; + declassify_assert(a < 1024); + declassify_assert(b < 1024); + declassify_assert(c < 1024); + declassify_assert(d < 1024); + out[5 * i] = (uint8_t)a; + out[5 * i + 1] = (uint8_t)((a >> 8) | (b << 2)); + out[5 * i + 2] = (uint8_t)((b >> 6) | (c << 4)); + out[5 * i + 3] = (uint8_t)((c >> 4) | (d << 6)); + out[5 * i + 4] = (uint8_t)(d >> 2); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 4 and b = +// 2^19. +static void scalar_encode_signed_4_eta(uint8_t out[128], const scalar *s) { + // Every two elements lands on a byte boundary. + static_assert(DEGREE % 2 == 0, "DEGREE must be a multiple of 2"); + for (int i = 0; i < DEGREE / 2; i++) { + uint32_t a = mod_sub(ETA, s->c[2 * i]); + uint32_t b = mod_sub(ETA, s->c[2 * i + 1]); + declassify_assert(a < 16); + declassify_assert(b < 16); + out[i] = a | (b << 4); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 13 and b = +// 2^12. +static void scalar_encode_signed_13_12(uint8_t out[416], const scalar *s) { + static const uint32_t kMax = 1u << 12; + // Every two elements lands on a byte boundary. + static_assert(DEGREE % 8 == 0, "DEGREE must be a multiple of 8"); + for (int i = 0; i < DEGREE / 8; i++) { + uint32_t a = mod_sub(kMax, s->c[8 * i]); + uint32_t b = mod_sub(kMax, s->c[8 * i + 1]); + uint32_t c = mod_sub(kMax, s->c[8 * i + 2]); + uint32_t d = mod_sub(kMax, s->c[8 * i + 3]); + uint32_t e = mod_sub(kMax, s->c[8 * i + 4]); + uint32_t f = mod_sub(kMax, s->c[8 * i + 5]); + uint32_t g = mod_sub(kMax, s->c[8 * i + 6]); + uint32_t h = mod_sub(kMax, s->c[8 * i + 7]); + declassify_assert(a < (1u << 13)); + declassify_assert(b < (1u << 13)); + declassify_assert(c < (1u << 13)); + declassify_assert(d < (1u << 13)); + declassify_assert(e < (1u << 13)); + declassify_assert(f < (1u << 13)); + declassify_assert(g < (1u << 13)); + declassify_assert(h < (1u << 13)); + a |= b << 13; + a |= c << 26; + c >>= 6; + c |= d << 7; + c |= e << 20; + e >>= 12; + e |= f << 1; + e |= g << 14; + e |= h << 27; + h >>= 5; + OPENSSL_memcpy(&out[13 * i], &a, sizeof(a)); + OPENSSL_memcpy(&out[13 * i + 4], &c, sizeof(c)); + OPENSSL_memcpy(&out[13 * i + 8], &e, sizeof(e)); + OPENSSL_memcpy(&out[13 * i + 12], &h, 1); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 20 and b = +// 2^19. +static void scalar_encode_signed_20_19(uint8_t out[640], const scalar *s) { + static const uint32_t kMax = 1u << 19; + // Every two elements lands on a byte boundary. 
+ static_assert(DEGREE % 4 == 0, "DEGREE must be a multiple of 4"); + for (int i = 0; i < DEGREE / 4; i++) { + uint32_t a = mod_sub(kMax, s->c[4 * i]); + uint32_t b = mod_sub(kMax, s->c[4 * i + 1]); + uint32_t c = mod_sub(kMax, s->c[4 * i + 2]); + uint32_t d = mod_sub(kMax, s->c[4 * i + 3]); + declassify_assert(a < (1u << 20)); + declassify_assert(b < (1u << 20)); + declassify_assert(c < (1u << 20)); + declassify_assert(d < (1u << 20)); + a |= b << 20; + b >>= 12; + b |= c << 8; + b |= d << 28; + d >>= 4; + OPENSSL_memcpy(&out[10 * i], &a, sizeof(a)); + OPENSSL_memcpy(&out[10 * i + 4], &b, sizeof(b)); + OPENSSL_memcpy(&out[10 * i + 8], &d, 2); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). +static void scalar_encode_signed(uint8_t *out, const scalar *s, int bits, + uint32_t max) { + if (bits == 4) { + assert(max == ETA); + scalar_encode_signed_4_eta(out, s); + } else if (bits == 20) { + assert(max == 1u << 19); + scalar_encode_signed_20_19(out, s); + } else { + assert(bits == 13); + assert(max == 1u << 12); + scalar_encode_signed_13_12(out, s); + } +} + +// FIPS 204, Algorithm 18 (`SimpleBitUnpack`). Specialized for bitlen(b) == 10. +static void scalar_decode_10(scalar *out, const uint8_t in[320]) { + uint32_t v; + static_assert(DEGREE % 4 == 0, "DEGREE must be a multiple of 4"); + for (int i = 0; i < DEGREE / 4; i++) { + OPENSSL_memcpy(&v, &in[5 * i], sizeof(v)); + out->c[4 * i] = v & 0x3ff; + out->c[4 * i + 1] = (v >> 10) & 0x3ff; + out->c[4 * i + 2] = (v >> 20) & 0x3ff; + out->c[4 * i + 3] = (v >> 30) | (((uint32_t)in[5 * i + 4]) << 2); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 4 and b = +// eta. +static int scalar_decode_signed_4_eta(scalar *out, const uint8_t in[128]) { + uint32_t v; + static_assert(DEGREE % 8 == 0, "DEGREE must be a multiple of 8"); + for (int i = 0; i < DEGREE / 8; i++) { + OPENSSL_memcpy(&v, &in[4 * i], sizeof(v)); + static_assert(ETA == 4, "ETA must be 4"); + // None of the nibbles may be >= 9. So if the MSB of any nibble is set, none + // of the other bits may be set. First, select all the MSBs. + const uint32_t msbs = v & 0x88888888u; + // For each nibble where the MSB is set, form a mask of all the other bits. + const uint32_t mask = (msbs >> 1) | (msbs >> 2) | (msbs >> 3); + // A nibble is only out of range in the case of invalid input, in which case + // it is okay to leak the value. + if (constant_time_declassify_int((mask & v) != 0)) { + return 0; + } + + out->c[i * 8] = mod_sub(ETA, v & 15); + out->c[i * 8 + 1] = mod_sub(ETA, (v >> 4) & 15); + out->c[i * 8 + 2] = mod_sub(ETA, (v >> 8) & 15); + out->c[i * 8 + 3] = mod_sub(ETA, (v >> 12) & 15); + out->c[i * 8 + 4] = mod_sub(ETA, (v >> 16) & 15); + out->c[i * 8 + 5] = mod_sub(ETA, (v >> 20) & 15); + out->c[i * 8 + 6] = mod_sub(ETA, (v >> 24) & 15); + out->c[i * 8 + 7] = mod_sub(ETA, v >> 28); + } + return 1; +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 13 and b = +// 2^12. 
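BitPack stores each coefficient x as b - x, so unsigned decoding just computes b - raw. With b = 2^12 and 13-bit fields (and likewise b = 2^19 with 20-bit fields below) every bit pattern decodes into the valid range, which is why these decoders, unlike the eta = 4 one above, have no failure case. A one-coefficient round trip, sketched with the helpers already defined:

static void bitpack_13_12_example(void) {
  uint32_t x = kPrime - 100;                      // represents -100 mod kPrime
  uint32_t packed = mod_sub(1u << 12, x);         // BitPack: b - x == 4196, fits in 13 bits
  uint32_t unpacked = mod_sub(1u << 12, packed);  // BitUnpack: b - packed
  assert(unpacked == x);                          // round-trips exactly
}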
+static void scalar_decode_signed_13_12(scalar *out, const uint8_t in[416]) { + static const uint32_t kMax = 1u << 12; + static const uint32_t k13Bits = (1u << 13) - 1; + static const uint32_t k7Bits = (1u << 7) - 1; + + uint32_t a, b, c; + uint8_t d; + static_assert(DEGREE % 8 == 0, "DEGREE must be a multiple of 8"); + for (int i = 0; i < DEGREE / 8; i++) { + OPENSSL_memcpy(&a, &in[13 * i], sizeof(a)); + OPENSSL_memcpy(&b, &in[13 * i + 4], sizeof(b)); + OPENSSL_memcpy(&c, &in[13 * i + 8], sizeof(c)); + d = in[13 * i + 12]; + + // It's not possible for a 13-bit number to be out of range when the max is + // 2^12. + out->c[i * 8] = mod_sub(kMax, a & k13Bits); + out->c[i * 8 + 1] = mod_sub(kMax, (a >> 13) & k13Bits); + out->c[i * 8 + 2] = mod_sub(kMax, (a >> 26) | ((b & k7Bits) << 6)); + out->c[i * 8 + 3] = mod_sub(kMax, (b >> 7) & k13Bits); + out->c[i * 8 + 4] = mod_sub(kMax, (b >> 20) | ((c & 1) << 12)); + out->c[i * 8 + 5] = mod_sub(kMax, (c >> 1) & k13Bits); + out->c[i * 8 + 6] = mod_sub(kMax, (c >> 14) & k13Bits); + out->c[i * 8 + 7] = mod_sub(kMax, (c >> 27) | ((uint32_t)d) << 5); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 20 and b = +// 2^19. +static void scalar_decode_signed_20_19(scalar *out, const uint8_t in[640]) { + static const uint32_t kMax = 1u << 19; + static const uint32_t k20Bits = (1u << 20) - 1; + + uint32_t a, b; + uint16_t c; + static_assert(DEGREE % 4 == 0, "DEGREE must be a multiple of 4"); + for (int i = 0; i < DEGREE / 4; i++) { + OPENSSL_memcpy(&a, &in[10 * i], sizeof(a)); + OPENSSL_memcpy(&b, &in[10 * i + 4], sizeof(b)); + OPENSSL_memcpy(&c, &in[10 * i + 8], sizeof(c)); + + // It's not possible for a 20-bit number to be out of range when the max is + // 2^19. + out->c[i * 4] = mod_sub(kMax, a & k20Bits); + out->c[i * 4 + 1] = mod_sub(kMax, (a >> 20) | ((b & 0xff) << 12)); + out->c[i * 4 + 2] = mod_sub(kMax, (b >> 8) & k20Bits); + out->c[i * 4 + 3] = mod_sub(kMax, (b >> 28) | ((uint32_t)c) << 4); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). +static int scalar_decode_signed(scalar *out, const uint8_t *in, int bits, + uint32_t max) { + if (bits == 4) { + assert(max == ETA); + return scalar_decode_signed_4_eta(out, in); + } else if (bits == 13) { + assert(max == (1u << 12)); + scalar_decode_signed_13_12(out, in); + return 1; + } else if (bits == 20) { + assert(max == (1u << 19)); + scalar_decode_signed_20_19(out, in); + return 1; + } else { + abort(); + } +} + +/* Expansion functions */ + +// FIPS 204, Algorithm 30 (`RejNTTPoly`). +// +// Rejection samples a Keccak stream to get uniformly distributed elements. This +// is used for matrix expansion and only operates on public inputs. +static void scalar_from_keccak_vartime( + scalar *out, const uint8_t derived_seed[RHO_BYTES + 2]) { + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, RHO_BYTES + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 168); + static_assert(168 % 3 == 0, "block and coefficient boundaries do not align"); + + int done = 0; + while (done < DEGREE) { + uint8_t block[168]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; i += 3) { + // FIPS 204, Algorithm 14 (`CoeffFromThreeBytes`). 
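      // (Each three-byte group below is read little-endian with the top bit of
      // the third byte cleared, giving a 23-bit candidate; candidates >= kPrime
      // are rejected. The acceptance probability is kPrime / 2^23, roughly
      // 0.999, so one 168-byte SHAKE-128 block yields about 56 coefficients and
      // a full 256-coefficient scalar typically needs five squeezed blocks.)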
+ uint32_t value = (uint32_t)block[i] | ((uint32_t)block[i + 1] << 8) | + (((uint32_t)block[i + 2] & 0x7f) << 16); + if (value < kPrime) { + out->c[done++] = value; + } + } + } +} + +// FIPS 204, Algorithm 31 (`RejBoundedPoly`). +static void scalar_uniform_eta_4(scalar *out, + const uint8_t derived_seed[SIGMA_BYTES + 2]) { + static_assert(ETA == 4, "This implementation is specialized for ETA == 4"); + + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, SIGMA_BYTES + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + int done = 0; + while (done < DEGREE) { + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; ++i) { + uint32_t t0 = block[i] & 0x0F; + uint32_t t1 = block[i] >> 4; + // FIPS 204, Algorithm 15 (`CoefFromHalfByte`). Although both the input + // and output here are secret, it is OK to leak when we rejected a byte. + // Individual bytes of the SHAKE-256 stream are (indistiguishable from) + // independent of each other and the original seed, so leaking information + // about the rejected bytes does not reveal the input or output. + if (constant_time_declassify_int(t0 < 9)) { + out->c[done++] = mod_sub(ETA, t0); + } + if (done < DEGREE && constant_time_declassify_int(t1 < 9)) { + out->c[done++] = mod_sub(ETA, t1); + } + } + } +} + +// FIPS 204, Algorithm 34 (`ExpandMask`), but just a single step. +static void scalar_sample_mask( + scalar *out, const uint8_t derived_seed[RHO_PRIME_BYTES + 2]) { + uint8_t buf[640]; + BORINGSSL_keccak(buf, sizeof(buf), derived_seed, RHO_PRIME_BYTES + 2, + boringssl_shake256); + + scalar_decode_signed_20_19(out, buf); +} + +// FIPS 204, Algorithm 29 (`SampleInBall`). +static void scalar_sample_in_ball_vartime(scalar *out, const uint8_t *seed, + int len) { + assert(len == 2 * LAMBDA_BYTES); + + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, seed, len); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + + uint64_t signs = CRYPTO_load_u64_le(block); + int offset = 8; + // SampleInBall implements a Fisher–Yates shuffle, which unavoidably leaks + // where the zeros are by memory access pattern. Although this leak happens + // before bad signatures are rejected, this is safe. See + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/8d8f01ac_70af3f21/ + CONSTTIME_DECLASSIFY(block + offset, sizeof(block) - offset); + + OPENSSL_memset(out, 0, sizeof(*out)); + for (size_t i = DEGREE - TAU; i < DEGREE; i++) { + size_t byte; + for (;;) { + if (offset == 136) { + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + // See above. + CONSTTIME_DECLASSIFY(block, sizeof(block)); + offset = 0; + } + + byte = block[offset++]; + if (byte <= i) { + break; + } + } + + out->c[i] = out->c[byte]; + out->c[byte] = mod_sub(1, 2 * (signs & 1)); + signs >>= 1; + } +} + +// FIPS 204, Algorithm 32 (`ExpandA`). 
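ExpandA samples every matrix entry directly in the NTT domain from SHAKE-128 keyed by rho plus a two-byte index, column byte first and row byte second, and the matrix is only ever used in that domain. A small sketch of the per-entry seed layout (hypothetical helper name, mirroring the loop below):

// Builds the (RHO_BYTES + 2)-byte seed rho || IntegerToBytes(col, 1) ||
// IntegerToBytes(row, 1) fed to RejNTTPoly for entry A[row][col].
static void expand_a_entry_seed(uint8_t out[RHO_BYTES + 2],
                                const uint8_t rho[RHO_BYTES], int row,
                                int col) {
  OPENSSL_memcpy(out, rho, RHO_BYTES);
  out[RHO_BYTES] = (uint8_t)col;
  out[RHO_BYTES + 1] = (uint8_t)row;
}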
+static void matrix_expand(matrix *out, const uint8_t rho[RHO_BYTES]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + + uint8_t derived_seed[RHO_BYTES + 2]; + OPENSSL_memcpy(derived_seed, rho, RHO_BYTES); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + derived_seed[RHO_BYTES + 1] = (uint8_t)i; + derived_seed[RHO_BYTES] = (uint8_t)j; + scalar_from_keccak_vartime(&out->v[i][j], derived_seed); + } + } +} + +// FIPS 204, Algorithm 33 (`ExpandS`). +static void vector_expand_short(vectorl *s1, vectork *s2, + const uint8_t sigma[SIGMA_BYTES]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + static_assert(K + L <= 0x100, "K+L must fit in 8 bits"); + + uint8_t derived_seed[SIGMA_BYTES + 2]; + OPENSSL_memcpy(derived_seed, sigma, SIGMA_BYTES); + derived_seed[SIGMA_BYTES] = 0; + derived_seed[SIGMA_BYTES + 1] = 0; + for (int i = 0; i < L; i++) { + scalar_uniform_eta_4(&s1->v[i], derived_seed); + ++derived_seed[SIGMA_BYTES]; + } + for (int i = 0; i < K; i++) { + scalar_uniform_eta_4(&s2->v[i], derived_seed); + ++derived_seed[SIGMA_BYTES]; + } +} + +// FIPS 204, Algorithm 34 (`ExpandMask`). +static void vectorl_expand_mask(vectorl *out, + const uint8_t seed[RHO_PRIME_BYTES], + size_t kappa) { + assert(kappa + L <= 0x10000); + + uint8_t derived_seed[RHO_PRIME_BYTES + 2]; + OPENSSL_memcpy(derived_seed, seed, RHO_PRIME_BYTES); + for (int i = 0; i < L; i++) { + size_t index = kappa + i; + derived_seed[RHO_PRIME_BYTES] = index & 0xFF; + derived_seed[RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF; + scalar_sample_mask(&out->v[i], derived_seed); + } +} + +/* Encoding */ + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). +// +// Encodes an entire vector into 32*K*|bits| bytes. Note that since 256 (DEGREE) +// is divisible by 8, the individual vector entries will always fill a whole +// number of bytes, so we do not need to worry about bit packing here. +static void vectork_encode(uint8_t *out, const vectork *a, int bits) { + if (bits == 4) { + for (int i = 0; i < K; i++) { + scalar_encode_4(out + i * bits * DEGREE / 8, &a->v[i]); + } + } else { + assert(bits == 10); + for (int i = 0; i < K; i++) { + scalar_encode_10(out + i * bits * DEGREE / 8, &a->v[i]); + } + } +} + +// FIPS 204, Algorithm 18 (`SimpleBitUnpack`). +static void vectork_decode_10(vectork *out, const uint8_t *in) { + for (int i = 0; i < K; i++) { + scalar_decode_10(&out->v[i], in + i * 10 * DEGREE / 8); + } +} + +static void vectork_encode_signed(uint8_t *out, const vectork *a, int bits, + uint32_t max) { + for (int i = 0; i < K; i++) { + scalar_encode_signed(out + i * bits * DEGREE / 8, &a->v[i], bits, max); + } +} + +static int vectork_decode_signed(vectork *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < K; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * DEGREE / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 17 (`BitPack`). +// +// Encodes an entire vector into 32*L*|bits| bytes. Note that since 256 (DEGREE) +// is divisible by 8, the individual vector entries will always fill a whole +// number of bytes, so we do not need to worry about bit packing here. 
+static void vectorl_encode_signed(uint8_t *out, const vectorl *a, int bits, + uint32_t max) { + for (int i = 0; i < L; i++) { + scalar_encode_signed(out + i * bits * DEGREE / 8, &a->v[i], bits, max); + } +} + +static int vectorl_decode_signed(vectorl *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < L; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * DEGREE / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 28 (`w1Encode`). +static void w1_encode(uint8_t out[128 * K], const vectork *w1) { + vectork_encode(out, w1, 4); +} + +// FIPS 204, Algorithm 20 (`HintBitPack`). +static void hint_bit_pack(uint8_t out[OMEGA + K], const vectork *h) { + OPENSSL_memset(out, 0, OMEGA + K); + int index = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < DEGREE; j++) { + if (h->v[i].c[j]) { + // h must have at most OMEGA non-zero coefficients. + BSSL_CHECK(index < OMEGA); + out[index++] = j; + } + } + out[OMEGA + i] = index; + } +} + +// FIPS 204, Algorithm 21 (`HintBitUnpack`). +static int hint_bit_unpack(vectork *h, const uint8_t in[OMEGA + K]) { + vectork_zero(h); + int index = 0; + for (int i = 0; i < K; i++) { + const int limit = in[OMEGA + i]; + if (limit < index || limit > OMEGA) { + return 0; + } + + int last = -1; + while (index < limit) { + int byte = in[index++]; + if (last >= 0 && byte <= last) { + return 0; + } + last = byte; + static_assert(DEGREE == 256, + "DEGREE must be 256 for this write to be in bounds"); + h->v[i].c[byte] = 1; + } + } + for (; index < OMEGA; index++) { + if (in[index] != 0) { + return 0; + } + } + return 1; +} + +struct public_key { + uint8_t rho[RHO_BYTES]; + vectork t1; + // Pre-cached value(s). + uint8_t public_key_hash[TR_BYTES]; +}; + +struct private_key { + uint8_t rho[RHO_BYTES]; + uint8_t k[K_BYTES]; + uint8_t public_key_hash[TR_BYTES]; + vectorl s1; + vectork s2; + vectork t0; +}; + +struct signature { + uint8_t c_tilde[2 * LAMBDA_BYTES]; + vectorl z; + vectork h; +}; + +// FIPS 204, Algorithm 22 (`pkEncode`). +static int mldsa_marshal_public_key(CBB *out, const struct public_key *pub) { + if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho))) { + return 0; + } + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 320 * K)) { + return 0; + } + vectork_encode(vectork_output, &pub->t1, 10); + + return 1; +} + +// FIPS 204, Algorithm 23 (`pkDecode`). +static int mldsa_parse_public_key(struct public_key *pub, CBS *in) { + if (!CBS_copy_bytes(in, pub->rho, sizeof(pub->rho))) { + return 0; + } + + CBS t1_bytes; + if (!CBS_get_bytes(in, &t1_bytes, 320 * K)) { + return 0; + } + vectork_decode_10(&pub->t1, CBS_data(&t1_bytes)); + + return 1; +} + +// FIPS 204, Algorithm 24 (`skEncode`). +static int mldsa_marshal_private_key(CBB *out, const struct private_key *priv) { + if (!CBB_add_bytes(out, priv->rho, sizeof(priv->rho)) || + !CBB_add_bytes(out, priv->k, sizeof(priv->k)) || + !CBB_add_bytes(out, priv->public_key_hash, + sizeof(priv->public_key_hash))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, 128 * L)) { + return 0; + } + vectorl_encode_signed(vectorl_output, &priv->s1, 4, ETA); + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 128 * K)) { + return 0; + } + vectork_encode_signed(vectork_output, &priv->s2, 4, ETA); + + if (!CBB_add_space(out, &vectork_output, 416 * K)) { + return 0; + } + vectork_encode_signed(vectork_output, &priv->t0, 13, 1 << 12); + + return 1; +} + +// FIPS 204, Algorithm 25 (`skDecode`). 
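For orientation, the encoders in this section determine the ML-DSA-65 wire sizes; the arithmetic below is just the constants of this file multiplied out, and the *_BYTES macros come from the public mldsa header:

//   pkEncode:  RHO_BYTES + 320*K                    = 32 + 1920      = 1952
//   skEncode:  32 + 32 + 64 + 128*L + 128*K + 416*K = 128 + 3904     = 4032
//   sigEncode: 2*LAMBDA_BYTES + 640*L + (OMEGA + K) = 48 + 3200 + 61 = 3309
static_assert(RHO_BYTES + 320 * K == MLDSA65_PUBLIC_KEY_BYTES,
              "pkEncode output must match the public key size");
static_assert(2 * LAMBDA_BYTES + 640 * L + OMEGA + K ==
                  MLDSA65_SIGNATURE_BYTES,
              "sigEncode output must match the signature size");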
+static int mldsa_parse_private_key(struct private_key *priv, CBS *in) { + CBS s1_bytes; + CBS s2_bytes; + CBS t0_bytes; + if (!CBS_copy_bytes(in, priv->rho, sizeof(priv->rho)) || + !CBS_copy_bytes(in, priv->k, sizeof(priv->k)) || + !CBS_copy_bytes(in, priv->public_key_hash, + sizeof(priv->public_key_hash)) || + !CBS_get_bytes(in, &s1_bytes, 128 * L) || + !vectorl_decode_signed(&priv->s1, CBS_data(&s1_bytes), 4, ETA) || + !CBS_get_bytes(in, &s2_bytes, 128 * K) || + !vectork_decode_signed(&priv->s2, CBS_data(&s2_bytes), 4, ETA) || + !CBS_get_bytes(in, &t0_bytes, 416 * K) || + // Note: Decoding 13 bits into (-2^12, 2^12] cannot fail. + !vectork_decode_signed(&priv->t0, CBS_data(&t0_bytes), 13, 1 << 12)) { + return 0; + } + + return 1; +} + +// FIPS 204, Algorithm 26 (`sigEncode`). +static int mldsa_marshal_signature(CBB *out, const struct signature *sign) { + if (!CBB_add_bytes(out, sign->c_tilde, sizeof(sign->c_tilde))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, 640 * L)) { + return 0; + } + vectorl_encode_signed(vectorl_output, &sign->z, 20, 1 << 19); + + uint8_t *hint_output; + if (!CBB_add_space(out, &hint_output, OMEGA + K)) { + return 0; + } + hint_bit_pack(hint_output, &sign->h); + + return 1; +} + +// FIPS 204, Algorithm 27 (`sigDecode`). +static int mldsa_parse_signature(struct signature *sign, CBS *in) { + CBS z_bytes; + CBS hint_bytes; + if (!CBS_copy_bytes(in, sign->c_tilde, sizeof(sign->c_tilde)) || + !CBS_get_bytes(in, &z_bytes, 640 * L) || + // Note: Decoding 20 bits into (-2^19, 2^19] cannot fail. + !vectorl_decode_signed(&sign->z, CBS_data(&z_bytes), 20, 1 << 19) || + !CBS_get_bytes(in, &hint_bytes, OMEGA + K) || + !hint_bit_unpack(&sign->h, CBS_data(&hint_bytes))) { + return 0; + }; + + return 1; +} + +static struct private_key *private_key_from_external( + const struct MLDSA65_private_key *external) { + static_assert( + sizeof(struct MLDSA65_private_key) == sizeof(struct private_key), + "Kyber private key size incorrect"); + static_assert( + alignof(struct MLDSA65_private_key) == alignof(struct private_key), + "Kyber private key align incorrect"); + return (struct private_key *)external; +} + +static struct public_key *public_key_from_external( + const struct MLDSA65_public_key *external) { + static_assert(sizeof(struct MLDSA65_public_key) == sizeof(struct public_key), + "mldsa public key size incorrect"); + static_assert( + alignof(struct MLDSA65_public_key) == alignof(struct public_key), + "mldsa public key align incorrect"); + return (struct public_key *)external; +} + +/* API */ + +// Calls |MLDSA_generate_key_external_entropy| with random bytes from +// |RAND_bytes|. Returns 1 on success and 0 on failure. +int MLDSA65_generate_key( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], + struct MLDSA65_private_key *out_private_key) { + RAND_bytes(out_seed, MLDSA_SEED_BYTES); + return MLDSA65_generate_key_external_entropy(out_encoded_public_key, + out_private_key, out_seed); +} + +int MLDSA65_private_key_from_seed(struct MLDSA65_private_key *out_private_key, + const uint8_t *seed, size_t seed_len) { + if (seed_len != MLDSA_SEED_BYTES) { + return 0; + } + uint8_t public_key[MLDSA65_PUBLIC_KEY_BYTES]; + return MLDSA65_generate_key_external_entropy(public_key, out_private_key, + seed); +} + +// FIPS 204, Algorithm 6 (`ML-DSA.KeyGen_internal`). Returns 1 on success and 0 +// on failure. 
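Key generation below is deterministic in the 32-byte seed; the randomized entry point simply feeds |RAND_bytes| output into the seeded path, so callers may persist just the seed and rebuild the private key later. A usage sketch of the public API declared in this file (illustrative only; a stack-constrained caller would heap-allocate the key structs, as this file itself does for its temporaries):

static int keygen_from_seed_example(void) {
  uint8_t encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES];
  uint8_t seed[MLDSA_SEED_BYTES];
  struct MLDSA65_private_key priv, priv_again;

  if (!MLDSA65_generate_key(encoded_public_key, seed, &priv) ||
      !MLDSA65_private_key_from_seed(&priv_again, seed, sizeof(seed))) {
    return 0;
  }
  // priv and priv_again now hold identical key material.
  return 1;
}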
+int MLDSA65_generate_key_external_entropy( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + struct MLDSA65_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + struct public_key pub; + matrix a_ntt; + vectorl s1_ntt; + vectork t; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + struct private_key *priv = private_key_from_external(out_private_key); + + uint8_t augmented_entropy[MLDSA_SEED_BYTES + 2]; + OPENSSL_memcpy(augmented_entropy, entropy, MLDSA_SEED_BYTES); + // The k and l parameters are appended to the seed. + augmented_entropy[MLDSA_SEED_BYTES] = K; + augmented_entropy[MLDSA_SEED_BYTES + 1] = L; + uint8_t expanded_seed[RHO_BYTES + SIGMA_BYTES + K_BYTES]; + BORINGSSL_keccak(expanded_seed, sizeof(expanded_seed), augmented_entropy, + sizeof(augmented_entropy), boringssl_shake256); + const uint8_t *const rho = expanded_seed; + const uint8_t *const sigma = expanded_seed + RHO_BYTES; + const uint8_t *const k = expanded_seed + RHO_BYTES + SIGMA_BYTES; + // rho is public. + CONSTTIME_DECLASSIFY(rho, RHO_BYTES); + OPENSSL_memcpy(values->pub.rho, rho, sizeof(values->pub.rho)); + OPENSSL_memcpy(priv->rho, rho, sizeof(priv->rho)); + OPENSSL_memcpy(priv->k, k, sizeof(priv->k)); + + matrix_expand(&values->a_ntt, rho); + vector_expand_short(&priv->s1, &priv->s2, sigma); + + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vectork_inverse_ntt(&values->t); + vectork_add(&values->t, &values->t, &priv->s2); + + vectork_power2_round(&values->pub.t1, &priv->t0, &values->t); + // t1 is public. + CONSTTIME_DECLASSIFY(&values->pub.t1, sizeof(values->pub.t1)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_public_key, MLDSA65_PUBLIC_KEY_BYTES); + if (!mldsa_marshal_public_key(&cbb, &values->pub)) { + goto err; + } + assert(CBB_len(&cbb) == MLDSA65_PUBLIC_KEY_BYTES); + + BORINGSSL_keccak(priv->public_key_hash, sizeof(priv->public_key_hash), + out_encoded_public_key, MLDSA65_PUBLIC_KEY_BYTES, + boringssl_shake256); + + ret = 1; +err: + OPENSSL_free(values); + return ret; +} + +int MLDSA65_public_from_private(struct MLDSA65_public_key *out_public_key, + const struct MLDSA65_private_key *private_key) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. 
+ struct values_st { + matrix a_ntt; + vectorl s1_ntt; + vectork t; + vectork t0; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + const struct private_key *priv = private_key_from_external(private_key); + struct public_key *pub = public_key_from_external(out_public_key); + + OPENSSL_memcpy(pub->rho, priv->rho, sizeof(pub->rho)); + OPENSSL_memcpy(pub->public_key_hash, priv->public_key_hash, + sizeof(pub->public_key_hash)); + + matrix_expand(&values->a_ntt, priv->rho); + + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vectork_inverse_ntt(&values->t); + vectork_add(&values->t, &values->t, &priv->s2); + + vectork_power2_round(&pub->t1, &values->t0, &values->t); + + ret = 1; +err: + OPENSSL_free(values); + return ret; +} + +// FIPS 204, Algorithm 7 (`ML-DSA.Sign_internal`). Returns 1 on success and 0 on +// failure. +int MLDSA65_sign_internal( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]) { + int ret = 0; + const struct private_key *priv = private_key_from_external(private_key); + + uint8_t mu[MU_BYTES]; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->public_key_hash, + sizeof(priv->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len); + BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, MU_BYTES); + + uint8_t rho_prime[RHO_PRIME_BYTES]; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->k, sizeof(priv->k)); + BORINGSSL_keccak_absorb(&keccak_ctx, randomizer, + MLDSA_SIGNATURE_RANDOMIZER_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_squeeze(&keccak_ctx, rho_prime, RHO_PRIME_BYTES); + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + struct signature sign; + vectorl s1_ntt; + vectork s2_ntt; + vectork t0_ntt; + matrix a_ntt; + vectorl y; + vectork w; + vectork w1; + vectorl cs1; + vectork cs2; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + OPENSSL_memcpy(&values->s2_ntt, &priv->s2, sizeof(values->s2_ntt)); + vectork_ntt(&values->s2_ntt); + + OPENSSL_memcpy(&values->t0_ntt, &priv->t0, sizeof(values->t0_ntt)); + vectork_ntt(&values->t0_ntt); + + matrix_expand(&values->a_ntt, priv->rho); + + // kappa must not exceed 2**16/L = 13107. But the probability of it exceeding + // even 1000 iterations is vanishingly small. 
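  // (For this parameter set the two rejection checks in the loop below fail
  // often enough that, per the Dilithium design documents, signing needs on
  // the order of four to five passes on average, so kappa stays far below the
  // 2**16/L ceiling mentioned above.)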
+ for (size_t kappa = 0;; kappa += L) { + vectorl_expand_mask(&values->y, rho_prime, kappa); + + vectorl *y_ntt = &values->cs1; + OPENSSL_memcpy(y_ntt, &values->y, sizeof(*y_ntt)); + vectorl_ntt(y_ntt); + + matrix_mult(&values->w, &values->a_ntt, y_ntt); + vectork_inverse_ntt(&values->w); + + vectork_high_bits(&values->w1, &values->w); + uint8_t w1_encoded[128 * K]; + w1_encode(w1_encoded, &values->w1); + + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K); + BORINGSSL_keccak_squeeze(&keccak_ctx, values->sign.c_tilde, + 2 * LAMBDA_BYTES); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, + sizeof(values->sign.c_tilde)); + scalar_ntt(&c_ntt); + + vectorl_mult_scalar(&values->cs1, &values->s1_ntt, &c_ntt); + vectorl_inverse_ntt(&values->cs1); + vectork_mult_scalar(&values->cs2, &values->s2_ntt, &c_ntt); + vectork_inverse_ntt(&values->cs2); + + vectorl_add(&values->sign.z, &values->y, &values->cs1); + + vectork *r0 = &values->w1; + vectork_sub(r0, &values->w, &values->cs2); + vectork_low_bits(r0, r0); + + // Leaking the fact that a signature was rejected is fine as the next + // attempt at a signature will be (indistinguishable from) independent of + // this one. Note, however, that we additionally leak which of the two + // branches rejected the signature. Section 5.5 of + // https://pq-crystals.org/dilithium/data/dilithium-specification-round3.pdf + // describes this leak as OK. Note we leak less than what is described by + // the paper; we do not reveal which coefficient violated the bound, and we + // hide which of the |z_max| or |r0_max| bound failed. See also + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/2bbab0fa_d241d35a/ + uint32_t z_max = vectorl_max(&values->sign.z); + uint32_t r0_max = vectork_max_signed(r0); + if (constant_time_declassify_w( + constant_time_ge_w(z_max, kGamma1 - BETA) | + constant_time_ge_w(r0_max, kGamma2 - BETA))) { + continue; + } + + vectork *ct0 = &values->w1; + vectork_mult_scalar(ct0, &values->t0_ntt, &c_ntt); + vectork_inverse_ntt(ct0); + vectork_make_hint(&values->sign.h, ct0, &values->cs2, &values->w); + + // See above. + uint32_t ct0_max = vectork_max(ct0); + size_t h_ones = vectork_count_ones(&values->sign.h); + if (constant_time_declassify_w(constant_time_ge_w(ct0_max, kGamma2) | + constant_time_lt_w(OMEGA, h_ones))) { + continue; + } + + // Although computed with the private key, the signature is public. + CONSTTIME_DECLASSIFY(values->sign.c_tilde, sizeof(values->sign.c_tilde)); + CONSTTIME_DECLASSIFY(&values->sign.z, sizeof(values->sign.z)); + CONSTTIME_DECLASSIFY(&values->sign.h, sizeof(values->sign.h)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_signature, MLDSA65_SIGNATURE_BYTES); + if (!mldsa_marshal_signature(&cbb, &values->sign)) { + goto err; + } + + BSSL_CHECK(CBB_len(&cbb) == MLDSA65_SIGNATURE_BYTES); + ret = 1; + break; + } + +err: + OPENSSL_free(values); + return ret; +} + +// mldsa signature in randomized mode, filling the random bytes with +// |RAND_bytes|. Returns 1 on success and 0 on failure. 
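A usage sketch pairing the randomized signing entry point below with verification; illustrative only, with error handling collapsed into the return value and the private key assumed to come from |MLDSA65_generate_key|:

static int sign_verify_example(const struct MLDSA65_private_key *priv) {
  static const uint8_t kMsg[] = {'h', 'i'};
  static const uint8_t kContext[] = {'e', 'x'};
  uint8_t sig[MLDSA65_SIGNATURE_BYTES];
  struct MLDSA65_public_key pub;

  return MLDSA65_public_from_private(&pub, priv) &&
         MLDSA65_sign(sig, priv, kMsg, sizeof(kMsg), kContext,
                      sizeof(kContext)) &&
         MLDSA65_verify(&pub, sig, sizeof(sig), kMsg, sizeof(kMsg), kContext,
                        sizeof(kContext));
}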
+int MLDSA65_sign(uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255) { + return 0; + } + + uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]; + RAND_bytes(randomizer, sizeof(randomizer)); + + const uint8_t context_prefix[2] = {0, context_len}; + return MLDSA65_sign_internal(out_encoded_signature, private_key, msg, msg_len, + context_prefix, sizeof(context_prefix), context, + context_len, randomizer); +} + +// FIPS 204, Algorithm 3 (`ML-DSA.Verify`). +int MLDSA65_verify(const struct MLDSA65_public_key *public_key, + const uint8_t *signature, size_t signature_len, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255 || signature_len != MLDSA65_SIGNATURE_BYTES) { + return 0; + } + + const uint8_t context_prefix[2] = {0, context_len}; + return MLDSA65_verify_internal(public_key, signature, msg, msg_len, + context_prefix, sizeof(context_prefix), + context, context_len); +} + +// FIPS 204, Algorithm 8 (`ML-DSA.Verify_internal`). +int MLDSA65_verify_internal( + const struct MLDSA65_public_key *public_key, + const uint8_t encoded_signature[MLDSA65_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix, + size_t context_prefix_len, const uint8_t *context, size_t context_len) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + struct signature sign; + matrix a_ntt; + vectorl z_ntt; + vectork az_ntt; + vectork ct1_ntt; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + const struct public_key *pub = public_key_from_external(public_key); + + CBS cbs; + CBS_init(&cbs, encoded_signature, MLDSA65_SIGNATURE_BYTES); + if (!mldsa_parse_signature(&values->sign, &cbs)) { + goto err; + } + + matrix_expand(&values->a_ntt, pub->rho); + + uint8_t mu[MU_BYTES]; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, pub->public_key_hash, + sizeof(pub->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len); + BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, MU_BYTES); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, + sizeof(values->sign.c_tilde)); + scalar_ntt(&c_ntt); + + OPENSSL_memcpy(&values->z_ntt, &values->sign.z, sizeof(values->z_ntt)); + vectorl_ntt(&values->z_ntt); + + matrix_mult(&values->az_ntt, &values->a_ntt, &values->z_ntt); + + vectork_scale_power2_round(&values->ct1_ntt, &pub->t1); + vectork_ntt(&values->ct1_ntt); + + vectork_mult_scalar(&values->ct1_ntt, &values->ct1_ntt, &c_ntt); + + vectork *const w1 = &values->az_ntt; + vectork_sub(w1, &values->az_ntt, &values->ct1_ntt); + vectork_inverse_ntt(w1); + + vectork_use_hint_vartime(w1, &values->sign.h, w1); + uint8_t w1_encoded[128 * K]; + w1_encode(w1_encoded, w1); + + uint8_t c_tilde[2 * LAMBDA_BYTES]; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K); + BORINGSSL_keccak_squeeze(&keccak_ctx, c_tilde, 2 * LAMBDA_BYTES); + + uint32_t z_max = 
vectorl_max(&values->sign.z); + if (z_max < kGamma1 - BETA && + OPENSSL_memcmp(c_tilde, values->sign.c_tilde, 2 * LAMBDA_BYTES) == 0) { + ret = 1; + } + +err: + OPENSSL_free(values); + return ret; +} + +/* Serialization of keys. */ + +int MLDSA65_marshal_public_key(CBB *out, + const struct MLDSA65_public_key *public_key) { + return mldsa_marshal_public_key(out, public_key_from_external(public_key)); +} + +int MLDSA65_parse_public_key(struct MLDSA65_public_key *public_key, CBS *in) { + struct public_key *pub = public_key_from_external(public_key); + CBS orig_in = *in; + if (!mldsa_parse_public_key(pub, in) || CBS_len(in) != 0) { + return 0; + } + + // Compute pre-cached values. + BORINGSSL_keccak(pub->public_key_hash, sizeof(pub->public_key_hash), + CBS_data(&orig_in), CBS_len(&orig_in), boringssl_shake256); + return 1; +} + +int MLDSA65_marshal_private_key(CBB *out, + const struct MLDSA65_private_key *private_key) { + return mldsa_marshal_private_key(out, private_key_from_external(private_key)); +} + +int MLDSA65_parse_private_key(struct MLDSA65_private_key *private_key, + CBS *in) { + struct private_key *priv = private_key_from_external(private_key); + return mldsa_parse_private_key(priv, in) && CBS_len(in) == 0; +} diff --git a/Sources/CCryptoBoringSSL/crypto/mlkem/internal.h b/Sources/CCryptoBoringSSL/crypto/mlkem/internal.h new file mode 100644 index 00000000..c3018798 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/mlkem/internal.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2023, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H + +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + + +// MLKEM_ENCAP_ENTROPY is the number of bytes of uniformly random entropy +// necessary to encapsulate a secret. The entropy will be leaked to the +// decapsulating party. +#define MLKEM_ENCAP_ENTROPY 32 + +// MLKEM768_generate_key_external_seed is a deterministic function to create a +// pair of ML-KEM-768 keys, using the supplied seed. The seed needs to be +// uniformly random. This function is should only be used for tests, regular +// callers should use the non-deterministic |MLKEM768_generate_key| directly. +OPENSSL_EXPORT void MLKEM768_generate_key_external_seed( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + struct MLKEM768_private_key *out_private_key, + const uint8_t seed[MLKEM_SEED_BYTES]); + +// MLKEM768_encap_external_entropy behaves like |MLKEM768_encap|, but uses +// |MLKEM_ENCAP_ENTROPY| bytes of |entropy| for randomization. The decapsulating +// side will be able to recover |entropy| in full. This function should only be +// used for tests, regular callers should use the non-deterministic +// |MLKEM768_encap| directly. 
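For the regular, non-test path mentioned above, a round-trip sketch; the entry points used here (|MLKEM768_generate_key|, |MLKEM768_public_from_private|, |MLKEM768_encap|, |MLKEM768_decap|) are assumed to be the ones exposed by the public mlkem header, since this internal header only declares the deterministic test variants:

#include <string.h>

static int mlkem768_round_trip_sketch(void) {
  uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
  uint8_t seed[MLKEM_SEED_BYTES];
  uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
  uint8_t secret_a[MLKEM_SHARED_SECRET_BYTES];
  uint8_t secret_b[MLKEM_SHARED_SECRET_BYTES];
  struct MLKEM768_private_key priv;
  struct MLKEM768_public_key pub;

  MLKEM768_generate_key(encoded_public_key, seed, &priv);
  MLKEM768_public_from_private(&pub, &priv);
  MLKEM768_encap(ciphertext, secret_a, &pub);            // encapsulating side
  if (!MLKEM768_decap(secret_b, ciphertext, sizeof(ciphertext), &priv)) {
    return 0;                                            // malformed ciphertext
  }
  return memcmp(secret_a, secret_b, sizeof(secret_a)) == 0;  // test-style check
}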
+OPENSSL_EXPORT void MLKEM768_encap_external_entropy( + uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM768_public_key *public_key, + const uint8_t entropy[MLKEM_ENCAP_ENTROPY]); + +// MLKEM768_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-KEM-768 private keys. It returns one on success or +// zero on allocation error. (Note that one can also save just the seed value +// produced by |MLKEM768_generate_key|, which is significantly smaller.) +OPENSSL_EXPORT int MLKEM768_marshal_private_key( + CBB *out, const struct MLKEM768_private_key *private_key); + +// MLKEM1024_generate_key_external_seed is a deterministic function to create a +// pair of ML-KEM-1024 keys, using the supplied seed. The seed needs to be +// uniformly random. This function should only be used for tests; regular +// callers should use the non-deterministic |MLKEM1024_generate_key| directly. +OPENSSL_EXPORT void MLKEM1024_generate_key_external_seed( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + struct MLKEM1024_private_key *out_private_key, + const uint8_t seed[MLKEM_SEED_BYTES]); + +// MLKEM1024_encap_external_entropy behaves like |MLKEM1024_encap|, but uses +// |MLKEM_ENCAP_ENTROPY| bytes of |entropy| for randomization. The +// decapsulating side will be able to recover |entropy| in full. This function +// should only be used for tests; regular callers should use the +// non-deterministic |MLKEM1024_encap| directly. +OPENSSL_EXPORT void MLKEM1024_encap_external_entropy( + uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM1024_public_key *public_key, + const uint8_t entropy[MLKEM_ENCAP_ENTROPY]); + +// MLKEM1024_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-KEM-1024 private keys. It returns one on success or +// zero on allocation error. (Note that one can also save just the seed value +// produced by |MLKEM1024_generate_key|, which is significantly smaller.) +OPENSSL_EXPORT int MLKEM1024_marshal_private_key( + CBB *out, const struct MLKEM1024_private_key *private_key); + + +#if defined(__cplusplus) +} +#endif + +#endif // OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H diff --git a/Sources/CCryptoBoringSSL/crypto/obj/obj_dat.h b/Sources/CCryptoBoringSSL/crypto/obj/obj_dat.h index 71ef2d2b..f1b70639 100644 --- a/Sources/CCryptoBoringSSL/crypto/obj/obj_dat.h +++ b/Sources/CCryptoBoringSSL/crypto/obj/obj_dat.h @@ -57,7 +57,7 @@ /* This file is generated by crypto/obj/objects.go.
*/ -#define NUM_NID 965 +#define NUM_NID 966 static const uint8_t kObjectData[] = { /* NID_rsadsi */ @@ -8783,6 +8783,7 @@ static const ASN1_OBJECT kObjects[NUM_NID] = { {"HKDF", "hkdf", NID_hkdf, 0, NULL, 0}, {"X25519Kyber768Draft00", "X25519Kyber768Draft00", NID_X25519Kyber768Draft00, 0, NULL, 0}, + {"X25519MLKEM768", "X25519MLKEM768", NID_X25519MLKEM768, 0, NULL, 0}, }; static const uint16_t kNIDsInShortNameOrder[] = { @@ -8981,6 +8982,7 @@ static const uint16_t kNIDsInShortNameOrder[] = { 458 /* UID */, 948 /* X25519 */, 964 /* X25519Kyber768Draft00 */, + 965 /* X25519MLKEM768 */, 961 /* X448 */, 11 /* X500 */, 378 /* X500algorithms */, @@ -9852,6 +9854,7 @@ static const uint16_t kNIDsInLongNameOrder[] = { 375 /* Trust Root */, 948 /* X25519 */, 964 /* X25519Kyber768Draft00 */, + 965 /* X25519MLKEM768 */, 961 /* X448 */, 12 /* X509 */, 402 /* X509v3 AC Targeting */, diff --git a/Sources/CCryptoBoringSSL/crypto/pem/pem_lib.c b/Sources/CCryptoBoringSSL/crypto/pem/pem_lib.c index 34806378..b291782e 100644 --- a/Sources/CCryptoBoringSSL/crypto/pem/pem_lib.c +++ b/Sources/CCryptoBoringSSL/crypto/pem/pem_lib.c @@ -312,12 +312,11 @@ int PEM_ASN1_write_bio(i2d_of_void *i2d, const char *name, BIO *bp, void *x, const unsigned iv_len = EVP_CIPHER_iv_length(enc); if (pass == NULL) { - pass_len = 0; if (!callback) { callback = PEM_def_callback; } pass_len = (*callback)(buf, PEM_BUFSIZE, 1, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, PEM_R_READ_KEY); goto err; } @@ -393,7 +392,7 @@ int PEM_do_header(EVP_CIPHER_INFO *cipher, unsigned char *data, long *plen, callback = PEM_def_callback; } pass_len = callback(buf, PEM_BUFSIZE, 0, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); return 0; } @@ -779,11 +778,11 @@ int PEM_read_bio(BIO *bp, char **name, char **header, unsigned char **data, int PEM_def_callback(char *buf, int size, int rwflag, void *userdata) { if (!buf || !userdata || size < 0) { - return 0; + return -1; } size_t len = strlen((char *)userdata); if (len >= (size_t)size) { - return 0; + return -1; } OPENSSL_strlcpy(buf, userdata, (size_t)size); return (int)len; diff --git a/Sources/CCryptoBoringSSL/crypto/pem/pem_pk8.c b/Sources/CCryptoBoringSSL/crypto/pem/pem_pk8.c index aed4905f..415b45bd 100644 --- a/Sources/CCryptoBoringSSL/crypto/pem/pem_pk8.c +++ b/Sources/CCryptoBoringSSL/crypto/pem/pem_pk8.c @@ -113,12 +113,11 @@ static int do_pk8pkey(BIO *bp, const EVP_PKEY *x, int isder, int nid, } if (enc || (nid != -1)) { if (!pass) { - pass_len = 0; if (!cb) { cb = PEM_def_callback; } pass_len = cb(buf, PEM_BUFSIZE, 1, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, PEM_R_READ_KEY); PKCS8_PRIV_KEY_INFO_free(p8inf); return 0; @@ -166,7 +165,7 @@ EVP_PKEY *d2i_PKCS8PrivateKey_bio(BIO *bp, EVP_PKEY **x, pem_password_cb *cb, cb = PEM_def_callback; } pass_len = cb(psbuf, PEM_BUFSIZE, 0, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); X509_SIG_free(p8); return NULL; diff --git a/Sources/CCryptoBoringSSL/crypto/pem/pem_pkey.c b/Sources/CCryptoBoringSSL/crypto/pem/pem_pkey.c index 225a9e0d..d34d9359 100644 --- a/Sources/CCryptoBoringSSL/crypto/pem/pem_pkey.c +++ b/Sources/CCryptoBoringSSL/crypto/pem/pem_pkey.c @@ -110,7 +110,7 @@ EVP_PKEY *PEM_read_bio_PrivateKey(BIO *bp, EVP_PKEY **x, pem_password_cb *cb, cb = PEM_def_callback; } pass_len = cb(psbuf, PEM_BUFSIZE, 0, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, 
PEM_R_BAD_PASSWORD_READ); X509_SIG_free(p8); goto err; diff --git a/Sources/CCryptoBoringSSL/crypto/pkcs8/internal.h b/Sources/CCryptoBoringSSL/crypto/pkcs8/internal.h index 853e830f..20b62e4f 100644 --- a/Sources/CCryptoBoringSSL/crypto/pkcs8/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/pkcs8/internal.h @@ -57,6 +57,7 @@ #define OPENSSL_HEADER_PKCS8_INTERNAL_H #include +#include #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/deterministic.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/deterministic.c index 96b9b336..0c2c5a9d 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/deterministic.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/deterministic.c @@ -14,7 +14,8 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_DETERMINISTIC) @@ -35,6 +36,8 @@ static CRYPTO_MUTEX g_num_calls_lock = CRYPTO_MUTEX_INIT; void RAND_reset_for_fuzzing(void) { g_num_calls = 0; } +void CRYPTO_init_sysrand(void) {} + void CRYPTO_sysrand(uint8_t *out, size_t requested) { static const uint8_t kZeroKey[32]; @@ -50,6 +53,11 @@ void CRYPTO_sysrand(uint8_t *out, size_t requested) { CRYPTO_chacha_20(out, out, requested, kZeroKey, nonce, 0); } +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/fork_detect.c similarity index 81% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.c rename to Sources/CCryptoBoringSSL/crypto/rand_extra/fork_detect.c index b3e1506a..99be497a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/fork_detect.c @@ -16,8 +16,7 @@ #define _GNU_SOURCE // needed for madvise() and MAP_ANONYMOUS on Linux. 
#endif -#include -#include "fork_detect.h" +#include "../bcm_support.h" #if defined(OPENSSL_FORK_DETECTION_MADVISE) #include @@ -35,19 +34,18 @@ static_assert(MADV_WIPEONFORK == 18, "MADV_WIPEONFORK is not 18"); #include #endif // OPENSSL_FORK_DETECTION_MADVISE -#include "../delocate.h" -#include "../../internal.h" +#include "../internal.h" #if defined(OPENSSL_FORK_DETECTION_MADVISE) -DEFINE_BSS_GET(int, g_force_madv_wipeonfork); -DEFINE_BSS_GET(int, g_force_madv_wipeonfork_enabled); -DEFINE_STATIC_ONCE(g_fork_detect_once); -DEFINE_STATIC_MUTEX(g_fork_detect_lock); -DEFINE_BSS_GET(CRYPTO_atomic_u32 *, g_fork_detect_addr); -DEFINE_BSS_GET(uint64_t, g_fork_generation); +static int g_force_madv_wipeonfork; +static int g_force_madv_wipeonfork_enabled; +static CRYPTO_once_t g_fork_detect_once = CRYPTO_ONCE_INIT; +static CRYPTO_MUTEX g_fork_detect_lock = CRYPTO_MUTEX_INIT; +static CRYPTO_atomic_u32 * g_fork_detect_addr; +static uint64_t g_fork_generation; static void init_fork_detect(void) { - if (*g_force_madv_wipeonfork_bss_get()) { + if (g_force_madv_wipeonfork) { return; } @@ -74,13 +72,13 @@ static void init_fork_detect(void) { } CRYPTO_atomic_store_u32(addr, 1); - *g_fork_detect_addr_bss_get() = addr; - *g_fork_generation_bss_get() = 1; + g_fork_detect_addr = addr; + g_fork_generation = 1; } uint64_t CRYPTO_get_fork_generation(void) { - CRYPTO_once(g_fork_detect_once_bss_get(), init_fork_detect); + CRYPTO_once(&g_fork_detect_once, init_fork_detect); // In a single-threaded process, there are obviously no races because there's // only a single mutator in the address space. @@ -93,12 +91,12 @@ uint64_t CRYPTO_get_fork_generation(void) { // child process is single-threaded, the child may become multi-threaded // before it observes this. Therefore, we must synchronize the logic below. - CRYPTO_atomic_u32 *const flag_ptr = *g_fork_detect_addr_bss_get(); + CRYPTO_atomic_u32 *const flag_ptr = g_fork_detect_addr; if (flag_ptr == NULL) { // Our kernel is too old to support |MADV_WIPEONFORK| or // |g_force_madv_wipeonfork| is set. - if (*g_force_madv_wipeonfork_bss_get() && - *g_force_madv_wipeonfork_enabled_bss_get()) { + if (g_force_madv_wipeonfork && + g_force_madv_wipeonfork_enabled) { // A constant generation number to simulate support, even if the kernel // doesn't support it. return 42; @@ -114,7 +112,7 @@ uint64_t CRYPTO_get_fork_generation(void) { // In the common case, try to observe the flag without taking a lock. This // avoids cacheline contention in the PRNG. - uint64_t *const generation_ptr = g_fork_generation_bss_get(); + uint64_t *const generation_ptr = &g_fork_generation; if (CRYPTO_atomic_load_u32(flag_ptr) != 0) { // If we observe a non-zero flag, it is safe to read |generation_ptr| // without a lock. The flag and generation number are fixed for this copy of @@ -125,7 +123,7 @@ uint64_t CRYPTO_get_fork_generation(void) { // The flag was zero. The generation number must be incremented, but other // threads may have concurrently observed the zero, so take a lock before // incrementing. 
- CRYPTO_MUTEX *const lock = g_fork_detect_lock_bss_get(); + CRYPTO_MUTEX *const lock = &g_fork_detect_lock; CRYPTO_MUTEX_lock_write(lock); uint64_t current_generation = *generation_ptr; if (CRYPTO_atomic_load_u32(flag_ptr) == 0) { @@ -147,35 +145,35 @@ uint64_t CRYPTO_get_fork_generation(void) { } void CRYPTO_fork_detect_force_madv_wipeonfork_for_testing(int on) { - *g_force_madv_wipeonfork_bss_get() = 1; - *g_force_madv_wipeonfork_enabled_bss_get() = on; + g_force_madv_wipeonfork = 1; + g_force_madv_wipeonfork_enabled = on; } #elif defined(OPENSSL_FORK_DETECTION_PTHREAD_ATFORK) -DEFINE_STATIC_ONCE(g_pthread_fork_detection_once); -DEFINE_BSS_GET(uint64_t, g_atfork_fork_generation); +static CRYPTO_once_t g_pthread_fork_detection_once = CRYPTO_ONCE_INIT; +static uint64_t g_atfork_fork_generation; static void we_are_forked(void) { // Immediately after a fork, the process must be single-threaded. - uint64_t value = *g_atfork_fork_generation_bss_get() + 1; + uint64_t value = g_atfork_fork_generation + 1; if (value == 0) { value = 1; } - *g_atfork_fork_generation_bss_get() = value; + g_atfork_fork_generation = value; } static void init_pthread_fork_detection(void) { if (pthread_atfork(NULL, NULL, we_are_forked) != 0) { abort(); } - *g_atfork_fork_generation_bss_get() = 1; + g_atfork_fork_generation = 1; } uint64_t CRYPTO_get_fork_generation(void) { - CRYPTO_once(g_pthread_fork_detection_once_bss_get(), init_pthread_fork_detection); + CRYPTO_once(&g_pthread_fork_detection_once, init_pthread_fork_detection); - return *g_atfork_fork_generation_bss_get(); + return g_atfork_fork_generation; } #elif defined(OPENSSL_DOES_NOT_FORK) diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/getentropy.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/getentropy.c index ccab73e1..02ddc4d8 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/getentropy.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/getentropy.c @@ -18,7 +18,8 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_GETENTROPY) @@ -30,6 +31,8 @@ #include #endif +void CRYPTO_init_sysrand(void) {} + // CRYPTO_sysrand puts |requested| random bytes into |out|. 
void CRYPTO_sysrand(uint8_t *out, size_t requested) { while (requested > 0) { @@ -45,6 +48,11 @@ void CRYPTO_sysrand(uint8_t *out, size_t requested) { } } +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/getrandom_fillin.h b/Sources/CCryptoBoringSSL/crypto/rand_extra/getrandom_fillin.h similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/getrandom_fillin.h rename to Sources/CCryptoBoringSSL/crypto/rand_extra/getrandom_fillin.h diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/ios.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/ios.c index ca3b7127..33c4bf92 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/ios.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/ios.c @@ -14,19 +14,27 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_IOS) #include #include +void CRYPTO_init_sysrand(void) {} + void CRYPTO_sysrand(uint8_t *out, size_t requested) { if (CCRandomGenerateBytes(out, requested) != kCCSuccess) { abort(); } } +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/passive.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/passive.c index fcaaa37b..e693ba1a 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/passive.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/passive.c @@ -14,11 +14,27 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../fipsmodule/bcm_interface.h" +#include "../bcm_support.h" #include "../internal.h" #if defined(BORINGSSL_FIPS) +// passive_get_seed_entropy writes |out_entropy_len| bytes of entropy, suitable +// for seeding a DRBG, to |out_entropy|. It sets |*out_want_additional_input| to one if the +// entropy came directly from the CPU and zero if it came from the OS.
It +// actively obtains entropy from the CPU/OS +static void passive_get_seed_entropy(uint8_t *out_entropy, + size_t out_entropy_len, + int *out_want_additional_input) { + *out_want_additional_input = 0; + if (bcm_success(BCM_rand_bytes_hwrng(out_entropy, out_entropy_len))) { + *out_want_additional_input = 1; + } else { + CRYPTO_sysrand_for_seed(out_entropy, out_entropy_len); + } +} + #define ENTROPY_READ_LEN \ (/* last_block size */ 16 + CTR_DRBG_ENTROPY_LEN * BORINGSSL_FIPS_OVERREAD) @@ -143,7 +159,7 @@ void RAND_need_entropy(size_t bytes_needed) { if (get_seed_from_daemon(buf, todo)) { want_additional_input = 1; } else { - CRYPTO_get_seed_entropy(buf, todo, &want_additional_input); + passive_get_seed_entropy(buf, todo, &want_additional_input); } if (boringssl_fips_break_test("CRNG")) { @@ -152,7 +168,7 @@ void RAND_need_entropy(size_t bytes_needed) { OPENSSL_memset(buf, 0, todo); } - RAND_load_entropy(buf, todo, want_additional_input); + BCM_rand_load_entropy(buf, todo, want_additional_input); } #endif // FIPS diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/rand_extra.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/rand_extra.c index 61e685e5..a0b9f10b 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/rand_extra.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/rand_extra.c @@ -12,11 +12,21 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include + #include -#include +#include "../bcm_support.h" +#include "../fipsmodule/bcm_interface.h" +int RAND_bytes(uint8_t *buf, size_t len) { + BCM_rand_bytes(buf, len); + return 1; +} + +int RAND_pseudo_bytes(uint8_t *buf, size_t len) { return RAND_bytes(buf, len); } + void RAND_seed(const void *buf, int num) { // OpenSSH calls |RAND_seed| before jailing on the assumption that any needed // file descriptors etc will be opened. 
@@ -28,7 +38,7 @@ int RAND_load_file(const char *path, long num) { if (num < 0) { // read the "whole file" return 1; } else if (num <= INT_MAX) { - return (int) num; + return (int)num; } else { return INT_MAX; } @@ -38,37 +48,30 @@ const char *RAND_file_name(char *buf, size_t num) { return NULL; } void RAND_add(const void *buf, int num, double entropy) {} -int RAND_egd(const char *path) { - return 255; -} +int RAND_egd(const char *path) { return 255; } -int RAND_poll(void) { - return 1; -} +int RAND_poll(void) { return 1; } -int RAND_status(void) { - return 1; -} +int RAND_status(void) { return 1; } static const struct rand_meth_st kSSLeayMethod = { - RAND_seed, - RAND_bytes, - RAND_cleanup, - RAND_add, - RAND_pseudo_bytes, - RAND_status, + RAND_seed, RAND_bytes, RAND_cleanup, + RAND_add, RAND_pseudo_bytes, RAND_status, }; -RAND_METHOD *RAND_SSLeay(void) { - return (RAND_METHOD*) &kSSLeayMethod; -} +RAND_METHOD *RAND_SSLeay(void) { return (RAND_METHOD *)&kSSLeayMethod; } -RAND_METHOD *RAND_OpenSSL(void) { - return RAND_SSLeay(); -} +RAND_METHOD *RAND_OpenSSL(void) { return RAND_SSLeay(); } const RAND_METHOD *RAND_get_rand_method(void) { return RAND_SSLeay(); } int RAND_set_rand_method(const RAND_METHOD *method) { return 1; } void RAND_cleanup(void) {} + +void RAND_get_system_entropy_for_custom_prng(uint8_t *buf, size_t len) { + if (len > 256) { + abort(); + } + CRYPTO_sysrand_for_seed(buf, len); +} diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/sysrand_internal.h b/Sources/CCryptoBoringSSL/crypto/rand_extra/sysrand_internal.h new file mode 100644 index 00000000..94e99ef1 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/sysrand_internal.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2024, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_SYSRAND_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_SYSRAND_INTERNAL_H + +#include + +#if defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) +#define OPENSSL_RAND_DETERMINISTIC +#elif defined(OPENSSL_TRUSTY) +#define OPENSSL_RAND_TRUSTY +#elif defined(OPENSSL_WINDOWS) +#define OPENSSL_RAND_WINDOWS +#elif defined(OPENSSL_LINUX) +#define OPENSSL_RAND_URANDOM +#elif defined(OPENSSL_APPLE) && !defined(OPENSSL_MACOS) +// Unlike macOS, iOS and similar hide away getentropy(). +#define OPENSSL_RAND_IOS +#else +// By default if you are integrating BoringSSL we expect you to +// provide getentropy from the header file. 
+#define OPENSSL_RAND_GETENTROPY +#endif + +#endif // OPENSSL_HEADER_CRYPTO__SYSRAND_INTERNAL_H diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/trusty.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/trusty.c index 88be7cbc..57e4148a 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/trusty.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/trusty.c @@ -14,7 +14,8 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_TRUSTY) #include @@ -25,12 +26,19 @@ #include +void CRYPTO_init_sysrand(void) {} + void CRYPTO_sysrand(uint8_t *out, size_t requested) { if (trusty_rng_hw_rand(out, requested) != NO_ERROR) { abort(); } } +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/urandom.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/urandom.c similarity index 91% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/urandom.c rename to Sources/CCryptoBoringSSL/crypto/rand_extra/urandom.c index 5af61e91..eabb7e8d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/urandom.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/urandom.c @@ -18,7 +18,8 @@ #include -#include "internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_URANDOM) @@ -62,8 +63,7 @@ #include #include "getrandom_fillin.h" -#include "../delocate.h" -#include "../../internal.h" +#include "../internal.h" #if defined(USE_NR_getrandom) @@ -96,17 +96,17 @@ static ssize_t boringssl_getrandom(void *buf, size_t buf_len, unsigned flags) { static const int kHaveGetrandom = -3; // urandom_fd is a file descriptor to /dev/urandom. It's protected by |once|. -DEFINE_BSS_GET(int, urandom_fd) +static int urandom_fd; #if defined(USE_NR_getrandom) // getrandom_ready is one if |getrandom| had been initialized by the time // |init_once| was called and zero otherwise. -DEFINE_BSS_GET(int, getrandom_ready) +static int getrandom_ready; // extra_getrandom_flags_for_seed contains a value that is ORed into the flags // for getrandom() when reading entropy for a seed. -DEFINE_BSS_GET(int, extra_getrandom_flags_for_seed) +static int extra_getrandom_flags_for_seed; // On Android, check a system property to decide whether to set // |extra_getrandom_flags_for_seed| otherwise they will default to zero. If @@ -123,14 +123,14 @@ static void maybe_set_extra_getrandom_flags(void) { value[length] = 0; if (OPENSSL_strcasecmp(value, "true") == 0) { - *extra_getrandom_flags_for_seed_bss_get() = GRND_RANDOM; + extra_getrandom_flags_for_seed = GRND_RANDOM; } #endif } #endif // USE_NR_getrandom -DEFINE_STATIC_ONCE(rand_once) +static CRYPTO_once_t rand_once = CRYPTO_ONCE_INIT; // init_once initializes the state of this module to values previously // requested. This is the only function that modifies |urandom_fd|, which may be @@ -142,7 +142,7 @@ static void init_once(void) { ssize_t getrandom_ret = boringssl_getrandom(&dummy, sizeof(dummy), GRND_NONBLOCK); if (getrandom_ret == 1) { - *getrandom_ready_bss_get() = 1; + getrandom_ready = 1; have_getrandom = 1; } else if (getrandom_ret == -1 && errno == EAGAIN) { // We have getrandom, but the entropy pool has not been initialized yet. 
@@ -157,7 +157,7 @@ static void init_once(void) { } if (have_getrandom) { - *urandom_fd_bss_get() = kHaveGetrandom; + urandom_fd = kHaveGetrandom; maybe_set_extra_getrandom_flags(); return; } @@ -185,19 +185,19 @@ static void init_once(void) { abort(); } - *urandom_fd_bss_get() = fd; + urandom_fd = fd; } -DEFINE_STATIC_ONCE(wait_for_entropy_once) +static CRYPTO_once_t wait_for_entropy_once = CRYPTO_ONCE_INIT; static void wait_for_entropy(void) { - int fd = *urandom_fd_bss_get(); + int fd = urandom_fd; if (fd == kHaveGetrandom) { // |getrandom| and |getentropy| support blocking in |fill_with_entropy| // directly. For |getrandom|, we first probe with a non-blocking call to aid // debugging. #if defined(USE_NR_getrandom) - if (*getrandom_ready_bss_get()) { + if (getrandom_ready) { // The entropy pool was already initialized in |init_once|. return; } @@ -256,13 +256,13 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { #if defined (USE_NR_getrandom) if (seed) { - getrandom_flags |= *extra_getrandom_flags_for_seed_bss_get(); + getrandom_flags |= extra_getrandom_flags_for_seed; } #endif CRYPTO_init_sysrand(); if (block) { - CRYPTO_once(wait_for_entropy_once_bss_get(), wait_for_entropy); + CRYPTO_once(&wait_for_entropy_once, wait_for_entropy); } // Clear |errno| so it has defined value if |read| or |getrandom| @@ -271,7 +271,7 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { while (len > 0) { ssize_t r; - if (*urandom_fd_bss_get() == kHaveGetrandom) { + if (urandom_fd == kHaveGetrandom) { #if defined(USE_NR_getrandom) r = boringssl_getrandom(out, len, getrandom_flags); #else // USE_NR_getrandom @@ -280,7 +280,7 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { #endif } else { do { - r = read(*urandom_fd_bss_get(), out, len); + r = read(urandom_fd, out, len); } while (r == -1 && errno == EINTR); } @@ -295,7 +295,7 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { } void CRYPTO_init_sysrand(void) { - CRYPTO_once(rand_once_bss_get(), init_once); + CRYPTO_once(&rand_once, init_once); } // CRYPTO_sysrand puts |requested| random bytes into |out|. 
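The rand_extra backends touched by this change all implement one contract for bcm_support: CRYPTO_init_sysrand, CRYPTO_sysrand, CRYPTO_sysrand_if_available, and CRYPTO_sysrand_for_seed. As a rough sketch only (not part of the diff), a hypothetical backend whose entropy source is always ready would look like the following; platform_fill_random is an invented placeholder for the real OS call (getentropy, getrandom, CCRandomGenerateBytes, and so on in the files above and below).

#include <stddef.h>
#include <stdint.h>

// Hypothetical OS hook: a real backend must fill the buffer completely or
// abort, as the getentropy/ios/trusty implementations in this diff do.
extern void platform_fill_random(uint8_t *out, size_t len);

void CRYPTO_init_sysrand(void) {}  // nothing to set up for this backend

void CRYPTO_sysrand(uint8_t *out, size_t requested) {
  platform_fill_random(out, requested);
}

int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) {
  // Entropy is always available on this hypothetical platform.
  CRYPTO_sysrand(buf, len);
  return 1;
}

void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) {
  CRYPTO_sysrand(out, requested);
}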
diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/windows.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/windows.c index d7517af4..89351610 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/windows.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/windows.c @@ -14,7 +14,9 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "../internal.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_WINDOWS) @@ -88,6 +90,11 @@ void CRYPTO_sysrand(uint8_t *out, size_t requested) { #endif // WINAPI_PARTITION_APP && !WINAPI_PARTITION_DESKTOP +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/rsa_extra/internal.h b/Sources/CCryptoBoringSSL/crypto/rsa_extra/internal.h index 6317cfc0..b47b9638 100644 --- a/Sources/CCryptoBoringSSL/crypto/rsa_extra/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/rsa_extra/internal.h @@ -58,6 +58,8 @@ #ifndef OPENSSL_HEADER_RSA_EXTRA_INTERNAL_H #define OPENSSL_HEADER_RSA_EXTRA_INTERNAL_H +#include + #if defined(__cplusplus) extern "C" { #endif diff --git a/Sources/CCryptoBoringSSL/crypto/sha/sha1.c b/Sources/CCryptoBoringSSL/crypto/sha/sha1.c new file mode 100644 index 00000000..f22b0683 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/sha/sha1.c @@ -0,0 +1,52 @@ +/* Copyright (c) 2024, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#include + +#include + +#include "../fipsmodule/bcm_interface.h" + +int SHA1_Init(SHA_CTX *sha) { + BCM_sha1_init(sha); + return 1; +} + +int SHA1_Update(SHA_CTX *sha, const void *data, size_t len) { + BCM_sha1_update(sha, data, len); + return 1; +} + +int SHA1_Final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *sha) { + BCM_sha1_final(out, sha); + return 1; +} + +uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) { + SHA_CTX ctx; + BCM_sha1_init(&ctx); + BCM_sha1_update(&ctx, data, len); + BCM_sha1_final(out, &ctx); + OPENSSL_cleanse(&ctx, sizeof(ctx)); + return out; +} + +void SHA1_Transform(SHA_CTX *sha, const uint8_t block[SHA_CBLOCK]) { + BCM_sha1_transform(sha, block); +} + +void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, + const uint8_t xkey[SHA_DIGEST_LENGTH]) { + BCM_fips_186_2_prf(out, out_len, xkey); +} diff --git a/Sources/CCryptoBoringSSL/crypto/spx/spx.c b/Sources/CCryptoBoringSSL/crypto/spx/spx.c index bc2b3d22..0f5a5652 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/spx.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx.c @@ -16,24 +16,25 @@ #include +#define OPENSSL_UNSTABLE_EXPERIMENTAL_SPX #include #include -#include "./address.h" -#include "./fors.h" -#include "./merkle.h" -#include "./params.h" +#include "./spx_address.h" +#include "./spx_fors.h" +#include "./spx_merkle.h" +#include "./spx_params.h" #include "./spx_util.h" -#include "./thash.h" +#include "./spx_thash.h" -void spx_generate_key(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], +void SPX_generate_key(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], uint8_t out_secret_key[SPX_SECRET_KEY_BYTES]) { uint8_t seed[3 * SPX_N]; RAND_bytes(seed, 3 * SPX_N); - spx_generate_key_from_seed(out_public_key, out_secret_key, seed); + SPX_generate_key_from_seed(out_public_key, out_secret_key, seed); } -void spx_generate_key_from_seed(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], +void SPX_generate_key_from_seed(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], uint8_t out_secret_key[SPX_SECRET_KEY_BYTES], const uint8_t seed[3 * SPX_N]) { // Initialize SK.seed || SK.prf || PK.seed from seed. 
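Since spx.c renames the SPHINCS+ entry points from spx_* to SPX_* (SPX_generate_key above, SPX_sign and SPX_verify in the hunks below), a keygen/sign/verify round trip now looks roughly like this sketch. It is illustrative only: the <openssl/experimental/spx.h> include path is the upstream name and an assumption here (the vendored tree rewrites header names); the SPX_*_BYTES constants and the call signatures match the definitions shown in this file.

#define OPENSSL_UNSTABLE_EXPERIMENTAL_SPX
#include <openssl/experimental/spx.h>

#include <stdint.h>

// Generates a key pair, signs a short message with fresh randomness
// (randomized = 1), and returns SPX_verify's result: one on success.
static int spx_sign_verify_roundtrip(void) {
  uint8_t pk[SPX_PUBLIC_KEY_BYTES];
  uint8_t sk[SPX_SECRET_KEY_BYTES];
  uint8_t sig[SPX_SIGNATURE_BYTES];
  static const uint8_t kMsg[] = {'s', 'p', 'x'};

  SPX_generate_key(pk, sk);
  SPX_sign(sig, sk, kMsg, sizeof(kMsg), /*randomized=*/1);
  return SPX_verify(sig, pk, kMsg, sizeof(kMsg));
}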
@@ -51,7 +52,7 @@ void spx_generate_key_from_seed(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], memcpy(out_secret_key + 3 * SPX_N, out_public_key + SPX_N, SPX_N); } -void spx_sign(uint8_t out_signature[SPX_SIGNATURE_BYTES], +void SPX_sign(uint8_t out_signature[SPX_SIGNATURE_BYTES], const uint8_t secret_key[SPX_SECRET_KEY_BYTES], const uint8_t *msg, size_t msg_len, int randomized) { uint8_t addr[32] = {0}; @@ -102,7 +103,7 @@ void spx_sign(uint8_t out_signature[SPX_SIGNATURE_BYTES], idx_leaf, sk_seed, pk_seed); } -int spx_verify(const uint8_t signature[SPX_SIGNATURE_BYTES], +int SPX_verify(const uint8_t signature[SPX_SIGNATURE_BYTES], const uint8_t public_key[SPX_SECRET_KEY_BYTES], const uint8_t *msg, size_t msg_len) { uint8_t addr[32] = {0}; diff --git a/Sources/CCryptoBoringSSL/crypto/spx/address.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_address.c similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/spx/address.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_address.c index 8c489b1b..4e691572 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/address.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_address.c @@ -17,7 +17,7 @@ #include #include "../internal.h" -#include "./address.h" +#include "./spx_address.h" #include "./spx_util.h" diff --git a/Sources/CCryptoBoringSSL/crypto/spx/address.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_address.h similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/spx/address.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_address.h diff --git a/Sources/CCryptoBoringSSL/crypto/spx/fors.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_fors.c similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/fors.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_fors.c index 8dcbb5c7..e270b2bd 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/fors.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_fors.c @@ -16,11 +16,11 @@ #include -#include "./address.h" -#include "./fors.h" -#include "./params.h" +#include "./spx_address.h" +#include "./spx_fors.h" +#include "./spx_params.h" #include "./spx_util.h" -#include "./thash.h" +#include "./spx_thash.h" void spx_fors_sk_gen(uint8_t *fors_sk, uint32_t idx, const uint8_t sk_seed[SPX_N], const uint8_t pk_seed[SPX_N], diff --git a/Sources/CCryptoBoringSSL/crypto/spx/fors.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_fors.h similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/fors.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_fors.h index 77d677d9..cb6003e4 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/fors.h +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_fors.h @@ -17,7 +17,7 @@ #include -#include "./params.h" +#include "./spx_params.h" #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/spx/merkle.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.c similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/spx/merkle.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.c index 5f699923..02d3e214 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/merkle.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.c @@ -16,11 +16,11 @@ #include -#include "./address.h" -#include "./merkle.h" -#include "./params.h" -#include "./thash.h" -#include "./wots.h" +#include "./spx_address.h" +#include "./spx_merkle.h" +#include "./spx_params.h" +#include "./spx_thash.h" +#include "./spx_wots.h" void spx_treehash(uint8_t out_pk[SPX_N], const uint8_t sk_seed[SPX_N], uint32_t i /*target node index*/, diff --git 
a/Sources/CCryptoBoringSSL/crypto/spx/merkle.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.h similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/merkle.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.h index 7c422953..d659ab0a 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/merkle.h +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.h @@ -19,7 +19,7 @@ #include -#include "./params.h" +#include "./spx_params.h" #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/spx/params.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_params.h similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/spx/params.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_params.h diff --git a/Sources/CCryptoBoringSSL/crypto/spx/thash.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_thash.c similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/thash.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_thash.c index e9a8ac8b..0fb86470 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/thash.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_thash.c @@ -20,9 +20,9 @@ #include -#include "./params.h" +#include "./spx_params.h" #include "./spx_util.h" -#include "./thash.h" +#include "./spx_thash.h" static void spx_thash(uint8_t *output, const uint8_t *input, size_t input_blocks, const uint8_t pk_seed[SPX_N], diff --git a/Sources/CCryptoBoringSSL/crypto/spx/thash.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_thash.h similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/spx/thash.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_thash.h index 8189d727..5ba46c3f 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/thash.h +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_thash.h @@ -17,7 +17,7 @@ #include -#include "./params.h" +#include "./spx_params.h" #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/spx/wots.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_wots.c similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/spx/wots.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_wots.c index 21f35b72..7b840046 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/wots.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_wots.c @@ -18,11 +18,11 @@ #include #include -#include "./address.h" -#include "./params.h" +#include "./spx_address.h" +#include "./spx_params.h" #include "./spx_util.h" -#include "./thash.h" -#include "./wots.h" +#include "./spx_thash.h" +#include "./spx_wots.h" // Chaining function used in WOTS+. 
static void chain(uint8_t *output, const uint8_t *input, uint32_t start, diff --git a/Sources/CCryptoBoringSSL/crypto/spx/wots.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_wots.h similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/wots.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_wots.h index f40d5625..e9087f3f 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/wots.h +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_wots.h @@ -17,7 +17,7 @@ #include -#include "./params.h" +#include "./spx_params.h" #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/x509/by_dir.c b/Sources/CCryptoBoringSSL/crypto/x509/by_dir.c index d7cd8048..6b71063e 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/by_dir.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/by_dir.c @@ -313,14 +313,20 @@ static int get_cert_by_subject(X509_LOOKUP *xl, int type, X509_NAME *name, k); if (type == X509_LU_X509) { if ((X509_load_cert_file(xl, b->data, ent->dir_type)) == 0) { + // Don't expose the lower level error. All of these boil + // down to "we could not find a CA". + ERR_clear_error(); break; } } else if (type == X509_LU_CRL) { if ((X509_load_crl_file(xl, b->data, ent->dir_type)) == 0) { + // Don't expose the lower level error. All of these boil + // down to "we could not find a CRL". + ERR_clear_error(); break; } } - // else case will caught higher up + // The lack of a CA or CRL will be caught higher up k++; } diff --git a/Sources/CCryptoBoringSSL/crypto/x509/internal.h b/Sources/CCryptoBoringSSL/crypto/x509/internal.h index 6a1beefc..ad42c3ee 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/x509/internal.h @@ -341,8 +341,6 @@ struct x509_store_st { // Callbacks for various operations X509_STORE_CTX_verify_cb verify_cb; // error callback - X509_STORE_CTX_get_crl_fn get_crl; // retrieve CRL - X509_STORE_CTX_check_crl_fn check_crl; // Check CRL validity CRYPTO_refcount_t references; } /* X509_STORE */; @@ -374,8 +372,6 @@ struct x509_store_ctx_st { // Callbacks for various operations X509_STORE_CTX_verify_cb verify_cb; // error callback - X509_STORE_CTX_get_crl_fn get_crl; // retrieve CRL - X509_STORE_CTX_check_crl_fn check_crl; // Check CRL validity // The following is built up int last_untrusted; // index of last untrusted cert @@ -422,7 +418,7 @@ int x509_print_rsa_pss_params(BIO *bp, const X509_ALGOR *sigalg, int indent, // Signature algorithm functions. // x509_digest_sign_algorithm encodes the signing parameters of |ctx| as an -// AlgorithmIdentifer and saves the result in |algor|. It returns one on +// AlgorithmIdentifier and saves the result in |algor|. It returns one on // success, or zero on error.
int x509_digest_sign_algorithm(EVP_MD_CTX *ctx, X509_ALGOR *algor); diff --git a/Sources/CCryptoBoringSSL/crypto/x509/rsa_pss.c b/Sources/CCryptoBoringSSL/crypto/x509/rsa_pss.c index 9c868ce4..5c74fb5d 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/rsa_pss.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/rsa_pss.c @@ -125,7 +125,11 @@ static int rsa_md_to_algor(X509_ALGOR **palg, const EVP_MD *md) { if (*palg == NULL) { return 0; } - X509_ALGOR_set_md(*palg, md); + if (!X509_ALGOR_set_md(*palg, md)) { + X509_ALGOR_free(*palg); + *palg = NULL; + return 0; + } return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/x509/v3_utl.c b/Sources/CCryptoBoringSSL/crypto/x509/v3_utl.c index b65f67c5..7da46d17 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/v3_utl.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/v3_utl.c @@ -82,10 +82,10 @@ static void str_free(OPENSSL_STRING str); static int append_ia5(STACK_OF(OPENSSL_STRING) **sk, const ASN1_IA5STRING *email); -static int ipv4_from_asc(unsigned char v4[4], const char *in); -static int ipv6_from_asc(unsigned char v6[16], const char *in); +static int ipv4_from_asc(uint8_t v4[4], const char *in); +static int ipv6_from_asc(uint8_t v6[16], const char *in); static int ipv6_cb(const char *elem, size_t len, void *usr); -static int ipv6_hex(unsigned char *out, const char *in, size_t inlen); +static int ipv6_hex(uint8_t *out, const char *in, size_t inlen); // Add a CONF_VALUE name value pair to stack @@ -1154,7 +1154,7 @@ ASN1_OCTET_STRING *a2i_IPADDRESS_NC(const char *ipasc) { return NULL; } -int x509v3_a2i_ipadd(unsigned char ipout[16], const char *ipasc) { +int x509v3_a2i_ipadd(uint8_t ipout[16], const char *ipasc) { // If string contains a ':' assume IPv6 if (strchr(ipasc, ':')) { @@ -1170,25 +1170,58 @@ int x509v3_a2i_ipadd(unsigned char ipout[16], const char *ipasc) { } } -static int ipv4_from_asc(unsigned char v4[4], const char *in) { - int a0, a1, a2, a3; - if (sscanf(in, "%d.%d.%d.%d", &a0, &a1, &a2, &a3) != 4) { +// get_ipv4_component consumes one IPv4 component, terminated by either '.' or +// the end of the string, from |*str|. On success, it returns one, sets |*out| +// to the component, and advances |*str| to the first unconsumed character. On +// invalid input, it returns zero. +static int get_ipv4_component(uint8_t *out_byte, const char **str) { + // Store a slightly larger intermediary so the overflow check is easier. + uint32_t out = 0; + for (;;) { + if (!OPENSSL_isdigit(**str)) { + return 0; + } + out = (out * 10) + (**str - '0'); + if (out > 255) { + // Components must be 8-bit. + return 0; + } + (*str)++; + if ((**str) == '.' || (**str) == '\0') { + *out_byte = (uint8_t)out; + return 1; + } + if (out == 0) { + // Reject extra leading zeros. Parsers sometimes treat them as octal, so + // accepting them would misinterpret input. + return 0; + } + } +} + +// get_ipv4_dot consumes a '.' from |*str| and advances it. It returns one on +// success and zero if |*str| does not point to a '.'. 
+static int get_ipv4_dot(const char **str) { + if (**str != '.') { return 0; } - if ((a0 < 0) || (a0 > 255) || (a1 < 0) || (a1 > 255) || (a2 < 0) || - (a2 > 255) || (a3 < 0) || (a3 > 255)) { + (*str)++; + return 1; +} + +static int ipv4_from_asc(uint8_t v4[4], const char *in) { + if (!get_ipv4_component(&v4[0], &in) || !get_ipv4_dot(&in) || + !get_ipv4_component(&v4[1], &in) || !get_ipv4_dot(&in) || + !get_ipv4_component(&v4[2], &in) || !get_ipv4_dot(&in) || + !get_ipv4_component(&v4[3], &in) || *in != '\0') { return 0; } - v4[0] = a0; - v4[1] = a1; - v4[2] = a2; - v4[3] = a3; return 1; } typedef struct { // Temporary store for IPV6 output - unsigned char tmp[16]; + uint8_t tmp[16]; // Total number of bytes in tmp int total; // The position of a zero (corresponding to '::') @@ -1197,7 +1230,7 @@ typedef struct { int zero_cnt; } IPV6_STAT; -static int ipv6_from_asc(unsigned char v6[16], const char *in) { +static int ipv6_from_asc(uint8_t v6[16], const char *in) { IPV6_STAT v6stat; v6stat.total = 0; v6stat.zero_pos = -1; @@ -1305,7 +1338,7 @@ static int ipv6_cb(const char *elem, size_t len, void *usr) { // Convert a string of up to 4 hex digits into the corresponding IPv6 form. -static int ipv6_hex(unsigned char *out, const char *in, size_t inlen) { +static int ipv6_hex(uint8_t *out, const char *in, size_t inlen) { if (inlen > 4) { return 0; } diff --git a/Sources/CCryptoBoringSSL/crypto/x509/x509_lu.c b/Sources/CCryptoBoringSSL/crypto/x509/x509_lu.c index 75444603..29c22d83 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/x509_lu.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/x509_lu.c @@ -594,16 +594,6 @@ void X509_STORE_set_verify_cb(X509_STORE *ctx, ctx->verify_cb = verify_cb; } -void X509_STORE_set_get_crl(X509_STORE *ctx, - X509_STORE_CTX_get_crl_fn get_crl) { - ctx->get_crl = get_crl; -} - -void X509_STORE_set_check_crl(X509_STORE *ctx, - X509_STORE_CTX_check_crl_fn check_crl) { - ctx->check_crl = check_crl; -} - X509_STORE *X509_STORE_CTX_get0_store(const X509_STORE_CTX *ctx) { return ctx->ctx; } diff --git a/Sources/CCryptoBoringSSL/crypto/x509/x509_trs.c b/Sources/CCryptoBoringSSL/crypto/x509/x509_trs.c index 9d5118d3..af5a5b5e 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/x509_trs.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/x509_trs.c @@ -67,14 +67,14 @@ typedef struct x509_trust_st X509_TRUST; struct x509_trust_st { int trust; - int (*check_trust)(const X509_TRUST *, X509 *, int); + int (*check_trust)(const X509_TRUST *, X509 *); int nid; } /* X509_TRUST */; -static int trust_1oidany(const X509_TRUST *trust, X509 *x, int flags); -static int trust_compat(const X509_TRUST *trust, X509 *x, int flags); +static int trust_1oidany(const X509_TRUST *trust, X509 *x); +static int trust_compat(const X509_TRUST *trust, X509 *x); -static int obj_trust(int id, X509 *x, int flags); +static int obj_trust(int id, X509 *x); static const X509_TRUST trstandard[] = { {X509_TRUST_COMPAT, trust_compat, 0}, @@ -99,36 +99,36 @@ int X509_check_trust(X509 *x, int id, int flags) { } // We get this as a default value if (id == 0) { - int rv = obj_trust(NID_anyExtendedKeyUsage, x, 0); + int rv = obj_trust(NID_anyExtendedKeyUsage, x); if (rv != X509_TRUST_UNTRUSTED) { return rv; } - return trust_compat(NULL, x, 0); + return trust_compat(NULL, x); } const X509_TRUST *pt = X509_TRUST_get0(id); if (pt == NULL) { // Unknown trust IDs are silently reinterpreted as NIDs. This is unreachable // from the certificate verifier itself, but wpa_supplicant relies on it.
// Note this relies on commonly-used NIDs and trust IDs not colliding. - return obj_trust(id, x, flags); + return obj_trust(id, x); } - return pt->check_trust(pt, x, flags); + return pt->check_trust(pt, x); } int X509_is_valid_trust_id(int trust) { return X509_TRUST_get0(trust) != NULL; } -static int trust_1oidany(const X509_TRUST *trust, X509 *x, int flags) { +static int trust_1oidany(const X509_TRUST *trust, X509 *x) { if (x->aux && (x->aux->trust || x->aux->reject)) { - return obj_trust(trust->nid, x, flags); + return obj_trust(trust->nid, x); } // we don't have any trust settings: for compatibility we return trusted // if it is self signed - return trust_compat(trust, x, flags); + return trust_compat(trust, x); } -static int trust_compat(const X509_TRUST *trust, X509 *x, int flags) { +static int trust_compat(const X509_TRUST *trust, X509 *x) { if (!x509v3_cache_extensions(x)) { return X509_TRUST_UNTRUSTED; } @@ -139,25 +139,21 @@ static int trust_compat(const X509_TRUST *trust, X509 *x, int flags) { } } -static int obj_trust(int id, X509 *x, int flags) { +static int obj_trust(int id, X509 *x) { X509_CERT_AUX *ax = x->aux; if (!ax) { return X509_TRUST_UNTRUSTED; } - if (ax->reject) { - for (size_t i = 0; i < sk_ASN1_OBJECT_num(ax->reject); i++) { - const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(ax->reject, i); - if (OBJ_obj2nid(obj) == id) { - return X509_TRUST_REJECTED; - } + for (size_t i = 0; i < sk_ASN1_OBJECT_num(ax->reject); i++) { + const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(ax->reject, i); + if (OBJ_obj2nid(obj) == id) { + return X509_TRUST_REJECTED; } } - if (ax->trust) { - for (size_t i = 0; i < sk_ASN1_OBJECT_num(ax->trust); i++) { - const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(ax->trust, i); - if (OBJ_obj2nid(obj) == id) { - return X509_TRUST_TRUSTED; - } + for (size_t i = 0; i < sk_ASN1_OBJECT_num(ax->trust); i++) { + const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(ax->trust, i); + if (OBJ_obj2nid(obj) == id) { + return X509_TRUST_TRUSTED; } } return X509_TRUST_UNTRUSTED; diff --git a/Sources/CCryptoBoringSSL/crypto/x509/x509_vfy.c b/Sources/CCryptoBoringSSL/crypto/x509/x509_vfy.c index e9b290a9..20f26aa3 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/x509_vfy.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/x509_vfy.c @@ -117,6 +117,7 @@ static int get_crl(X509_STORE_CTX *ctx, X509_CRL **pcrl, X509 *x); static int crl_akid_check(X509_STORE_CTX *ctx, X509_CRL *crl, X509 **pissuer, int *pcrl_score); static int crl_crldp_check(X509 *x, X509_CRL *crl, int crl_score); +static int check_crl(X509_STORE_CTX *ctx, X509_CRL *crl); static int cert_crl(X509_STORE_CTX *ctx, X509_CRL *crl, X509 *x); static int internal_verify(X509_STORE_CTX *ctx); @@ -769,17 +770,18 @@ static int check_cert(X509_STORE_CTX *ctx) { // Try to retrieve the relevant CRL. Note that |get_crl| sets // |current_crl_issuer| and |current_crl_score|, which |check_crl| then reads. // - // TODO(davidben): Remove these callbacks. gRPC currently sets them, but - // implements them incorrectly. It is not actually possible to implement - // |get_crl| from outside the library. - if (!ctx->get_crl(ctx, &crl, x)) { + // TODO(davidben): The awkward internal calling convention is a historical + // artifact of when these functions were user-overridable callbacks, even + // though there was no way to set them correctly. These callbacks have since + // been removed, so we can pass input and output parameters more directly. 
+ if (!get_crl(ctx, &crl, x)) { ctx->error = X509_V_ERR_UNABLE_TO_GET_CRL; ok = call_verify_cb(0, ctx); goto err; } ctx->current_crl = crl; - if (!ctx->check_crl(ctx, crl) || // + if (!check_crl(ctx, crl) || // !cert_crl(ctx, crl, x)) { goto err; } @@ -1560,18 +1562,6 @@ int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, X509 *x509, ctx->verify_cb = null_callback; } - if (store->get_crl) { - ctx->get_crl = store->get_crl; - } else { - ctx->get_crl = get_crl; - } - - if (store->check_crl) { - ctx->check_crl = store->check_crl; - } else { - ctx->check_crl = check_crl; - } - return 1; err: diff --git a/Sources/CCryptoBoringSSL/crypto/x509/x_algor.c b/Sources/CCryptoBoringSSL/crypto/x509/x_algor.c index 8f4cc9aa..c9c61bcf 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/x_algor.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/x_algor.c @@ -123,7 +123,7 @@ void X509_ALGOR_get0(const ASN1_OBJECT **out_obj, int *out_param_type, // Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD -void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md) { +int X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md) { int param_type; if (EVP_MD_flags(md) & EVP_MD_FLAG_DIGALGID_ABSENT) { @@ -132,7 +132,7 @@ void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md) { param_type = V_ASN1_NULL; } - X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL); + return X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL); } // X509_ALGOR_cmp returns 0 if |a| and |b| are equal and non-zero otherwise. diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-apple.S index 05e38f93..0d61efe4 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -868,7 +867,6 @@ L$one_lsb: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-linux.S index 1d65cc35..276d820a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -883,7 +882,6 @@ _CET_ENDBR .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-apple.S new file mode 100644 index 00000000..99b6bae4 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-apple.S @@ -0,0 +1,2495 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt +.align 4 +_aes_hw_encrypt: +L_aes_hw_encrypt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L000pic_for_function_hit +L000pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+1-L000pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 12(%esp),%edx + movups (%eax),%xmm2 + movl 240(%edx),%ecx + movl 8(%esp),%eax + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L001enc1_loop_1: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L001enc1_loop_1 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt +.align 4 +_aes_hw_decrypt: +L_aes_hw_decrypt_begin: + movl 4(%esp),%eax + movl 12(%esp),%edx + movups (%eax),%xmm2 + movl 240(%edx),%ecx + movl 8(%esp),%eax + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L002dec1_loop_2: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L002dec1_loop_2 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.private_extern __aesni_encrypt2 +.align 4 +__aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L003enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L003enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.private_extern __aesni_decrypt2 +.align 4 +__aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L004dec2_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L004dec2_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + ret +.private_extern __aesni_encrypt3 +.align 4 +__aesni_encrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L005enc3_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + 
movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%edx,%ecx,1),%xmm0 + jnz L005enc3_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + ret +.private_extern __aesni_decrypt3 +.align 4 +__aesni_decrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L006dec3_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 + movups -16(%edx,%ecx,1),%xmm0 + jnz L006dec3_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 + ret +.private_extern __aesni_encrypt4 +.align 4 +__aesni_encrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L007enc4_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%edx,%ecx,1),%xmm0 + jnz L007enc4_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + ret +.private_extern __aesni_decrypt4 +.align 4 +__aesni_decrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L008dec4_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%edx,%ecx,1),%xmm0 + jnz L008dec4_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 +.byte 102,15,56,223,232 + ret +.private_extern __aesni_encrypt6 +.align 4 +__aesni_encrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp L009_aesni_encrypt6_inner +.align 4,0x90 +L010enc6_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +L009_aesni_encrypt6_inner: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +L_aesni_encrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%edx,%ecx,1),%xmm0 + jnz L010enc6_loop 
+.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + ret +.private_extern __aesni_decrypt6 +.align 4 +__aesni_decrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,222,209 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,222,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,222,225 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp L011_aesni_decrypt6_inner +.align 4,0x90 +L012dec6_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +L011_aesni_decrypt6_inner: +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +L_aesni_decrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%edx,%ecx,1),%xmm0 + jnz L012dec6_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 +.byte 102,15,56,223,232 +.byte 102,15,56,223,240 +.byte 102,15,56,223,248 + ret +.globl _aes_hw_ecb_encrypt +.private_extern _aes_hw_ecb_encrypt +.align 4 +_aes_hw_ecb_encrypt: +L_aes_hw_ecb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + andl $-16,%eax + jz L013ecb_ret + movl 240(%edx),%ecx + testl %ebx,%ebx + jz L014ecb_decrypt + movl %edx,%ebp + movl %ecx,%ebx + cmpl $96,%eax + jb L015ecb_enc_tail + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + subl $96,%eax + jmp L016ecb_enc_loop6_enter +.align 4,0x90 +L017ecb_enc_loop6: + movups %xmm2,(%edi) + movdqu (%esi),%xmm2 + movups %xmm3,16(%edi) + movdqu 16(%esi),%xmm3 + movups %xmm4,32(%edi) + movdqu 32(%esi),%xmm4 + movups %xmm5,48(%edi) + movdqu 48(%esi),%xmm5 + movups %xmm6,64(%edi) + movdqu 64(%esi),%xmm6 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi +L016ecb_enc_loop6_enter: + call __aesni_encrypt6 + movl %ebp,%edx + movl %ebx,%ecx + subl $96,%eax + jnc L017ecb_enc_loop6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + movups %xmm7,80(%edi) + leal 96(%edi),%edi + addl $96,%eax + jz L013ecb_ret +L015ecb_enc_tail: + movups (%esi),%xmm2 + cmpl $32,%eax + jb L018ecb_enc_one + movups 16(%esi),%xmm3 + je L019ecb_enc_two + movups 32(%esi),%xmm4 + cmpl $64,%eax + jb L020ecb_enc_three + movups 48(%esi),%xmm5 + je L021ecb_enc_four + movups 64(%esi),%xmm6 + xorps %xmm7,%xmm7 + call __aesni_encrypt6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L013ecb_ret +.align 4,0x90 +L018ecb_enc_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L022enc1_loop_3: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L022enc1_loop_3 +.byte 
102,15,56,221,209 + movups %xmm2,(%edi) + jmp L013ecb_ret +.align 4,0x90 +L019ecb_enc_two: + call __aesni_encrypt2 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L013ecb_ret +.align 4,0x90 +L020ecb_enc_three: + call __aesni_encrypt3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L013ecb_ret +.align 4,0x90 +L021ecb_enc_four: + call __aesni_encrypt4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + jmp L013ecb_ret +.align 4,0x90 +L014ecb_decrypt: + movl %edx,%ebp + movl %ecx,%ebx + cmpl $96,%eax + jb L023ecb_dec_tail + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + subl $96,%eax + jmp L024ecb_dec_loop6_enter +.align 4,0x90 +L025ecb_dec_loop6: + movups %xmm2,(%edi) + movdqu (%esi),%xmm2 + movups %xmm3,16(%edi) + movdqu 16(%esi),%xmm3 + movups %xmm4,32(%edi) + movdqu 32(%esi),%xmm4 + movups %xmm5,48(%edi) + movdqu 48(%esi),%xmm5 + movups %xmm6,64(%edi) + movdqu 64(%esi),%xmm6 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi +L024ecb_dec_loop6_enter: + call __aesni_decrypt6 + movl %ebp,%edx + movl %ebx,%ecx + subl $96,%eax + jnc L025ecb_dec_loop6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + movups %xmm7,80(%edi) + leal 96(%edi),%edi + addl $96,%eax + jz L013ecb_ret +L023ecb_dec_tail: + movups (%esi),%xmm2 + cmpl $32,%eax + jb L026ecb_dec_one + movups 16(%esi),%xmm3 + je L027ecb_dec_two + movups 32(%esi),%xmm4 + cmpl $64,%eax + jb L028ecb_dec_three + movups 48(%esi),%xmm5 + je L029ecb_dec_four + movups 64(%esi),%xmm6 + xorps %xmm7,%xmm7 + call __aesni_decrypt6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L013ecb_ret +.align 4,0x90 +L026ecb_dec_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L030dec1_loop_4: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L030dec1_loop_4 +.byte 102,15,56,223,209 + movups %xmm2,(%edi) + jmp L013ecb_ret +.align 4,0x90 +L027ecb_dec_two: + call __aesni_decrypt2 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L013ecb_ret +.align 4,0x90 +L028ecb_dec_three: + call __aesni_decrypt3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L013ecb_ret +.align 4,0x90 +L029ecb_dec_four: + call __aesni_decrypt4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +L013ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ccm64_encrypt_blocks +.private_extern _aes_hw_ccm64_encrypt_blocks +.align 4 +_aes_hw_ccm64_encrypt_blocks: +L_aes_hw_ccm64_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl %esp,%ebp + subl $60,%esp + andl $-16,%esp + movl %ebp,48(%esp) + movdqu (%ebx),%xmm7 + movdqu (%ecx),%xmm3 + movl 240(%edx),%ecx + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $1,%ebx + xorl %ebp,%ebp + movl %ebx,16(%esp) + movl %ebp,20(%esp) + movl %ebp,24(%esp) + movl 
%ebp,28(%esp) + shll $4,%ecx + movl $16,%ebx + leal (%edx),%ebp + movdqa (%esp),%xmm5 + movdqa %xmm7,%xmm2 + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx +.byte 102,15,56,0,253 +L031ccm64_enc_outer: + movups (%ebp),%xmm0 + movl %ebx,%ecx + movups (%esi),%xmm6 + xorps %xmm0,%xmm2 + movups 16(%ebp),%xmm1 + xorps %xmm6,%xmm0 + xorps %xmm0,%xmm3 + movups 32(%ebp),%xmm0 +L032ccm64_enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L032ccm64_enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + paddq 16(%esp),%xmm7 + decl %eax +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + leal 16(%esi),%esi + xorps %xmm2,%xmm6 + movdqa %xmm7,%xmm2 + movups %xmm6,(%edi) +.byte 102,15,56,0,213 + leal 16(%edi),%edi + jnz L031ccm64_enc_outer + movl 48(%esp),%esp + movl 40(%esp),%edi + movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ccm64_decrypt_blocks +.private_extern _aes_hw_ccm64_decrypt_blocks +.align 4 +_aes_hw_ccm64_decrypt_blocks: +L_aes_hw_ccm64_decrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl %esp,%ebp + subl $60,%esp + andl $-16,%esp + movl %ebp,48(%esp) + movdqu (%ebx),%xmm7 + movdqu (%ecx),%xmm3 + movl 240(%edx),%ecx + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $1,%ebx + xorl %ebp,%ebp + movl %ebx,16(%esp) + movl %ebp,20(%esp) + movl %ebp,24(%esp) + movl %ebp,28(%esp) + movdqa (%esp),%xmm5 + movdqa %xmm7,%xmm2 + movl %edx,%ebp + movl %ecx,%ebx +.byte 102,15,56,0,253 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L033enc1_loop_5: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L033enc1_loop_5 +.byte 102,15,56,221,209 + shll $4,%ebx + movl $16,%ecx + movups (%esi),%xmm6 + paddq 16(%esp),%xmm7 + leal 16(%esi),%esi + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp L034ccm64_dec_outer +.align 4,0x90 +L034ccm64_dec_outer: + xorps %xmm2,%xmm6 + movdqa %xmm7,%xmm2 + movups %xmm6,(%edi) + leal 16(%edi),%edi +.byte 102,15,56,0,213 + subl $1,%eax + jz L035ccm64_dec_break + movups (%ebp),%xmm0 + movl %ebx,%ecx + movups 16(%ebp),%xmm1 + xorps %xmm0,%xmm6 + xorps %xmm0,%xmm2 + xorps %xmm6,%xmm3 + movups 32(%ebp),%xmm0 +L036ccm64_dec2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L036ccm64_dec2_loop + movups (%esi),%xmm6 + paddq 16(%esp),%xmm7 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + leal 16(%esi),%esi + jmp L034ccm64_dec_outer +.align 4,0x90 +L035ccm64_dec_break: + movl 240(%ebp),%ecx + movl %ebp,%edx + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm6 + leal 32(%edx),%edx + xorps %xmm6,%xmm3 +L037enc1_loop_6: +.byte 102,15,56,220,217 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L037enc1_loop_6 +.byte 102,15,56,221,217 + movl 48(%esp),%esp + movl 40(%esp),%edi + movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor 
%xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks +.align 4 +_aes_hw_ctr32_encrypt_blocks: +L_aes_hw_ctr32_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L038pic_for_function_hit +L038pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+0-L038pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $88,%esp + andl $-16,%esp + movl %ebp,80(%esp) + cmpl $1,%eax + je L039ctr32_one_shortcut + movdqu (%ebx),%xmm7 + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $6,%ecx + xorl %ebp,%ebp + movl %ecx,16(%esp) + movl %ecx,20(%esp) + movl %ecx,24(%esp) + movl %ebp,28(%esp) +.byte 102,15,58,22,251,3 +.byte 102,15,58,34,253,3 + movl 240(%edx),%ecx + bswap %ebx + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqa (%esp),%xmm2 +.byte 102,15,58,34,195,0 + leal 3(%ebx),%ebp +.byte 102,15,58,34,205,0 + incl %ebx +.byte 102,15,58,34,195,1 + incl %ebp +.byte 102,15,58,34,205,1 + incl %ebx +.byte 102,15,58,34,195,2 + incl %ebp +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 + cmpl $6,%eax + jb L040ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx + movdqa %xmm7,32(%esp) + movl %edx,%ebp + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl $6,%eax + jmp L041ctr32_loop6 +.align 4,0x90 +L041ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 + pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 + pxor %xmm0,%xmm3 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + call L_aesni_encrypt6_enter + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups %xmm2,(%edi) + movdqa 16(%esp),%xmm0 + xorps %xmm1,%xmm4 + movdqa 64(%esp),%xmm1 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + paddd %xmm0,%xmm1 + paddd 48(%esp),%xmm0 + movdqa (%esp),%xmm2 + movups 48(%esi),%xmm3 + movups 64(%esi),%xmm4 + xorps %xmm3,%xmm5 + movups 80(%esi),%xmm3 + leal 96(%esi),%esi + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 + xorps %xmm4,%xmm6 + movups %xmm5,48(%edi) + xorps %xmm3,%xmm7 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + movups %xmm6,64(%edi) + pshufd $192,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + pshufd $128,%xmm0,%xmm3 + subl $6,%eax + jnc L041ctr32_loop6 + addl $6,%eax + jz L042ctr32_ret + movdqu (%ebp),%xmm7 + movl %ebp,%edx + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +L040ctr32_tail: + por %xmm7,%xmm2 + cmpl $2,%eax + jb L043ctr32_one + pshufd $64,%xmm0,%xmm4 + por %xmm7,%xmm3 + je L044ctr32_two + pshufd $192,%xmm1,%xmm5 + por %xmm7,%xmm4 + cmpl $4,%eax + jb L045ctr32_three + pshufd $128,%xmm1,%xmm6 + por %xmm7,%xmm5 + je L046ctr32_four + por %xmm7,%xmm6 + call __aesni_encrypt6 + movups 
(%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm4 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm5 + movups %xmm2,(%edi) + xorps %xmm1,%xmm6 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L039ctr32_one_shortcut: + movups (%ebx),%xmm2 + movl 240(%edx),%ecx +L043ctr32_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L047enc1_loop_7: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L047enc1_loop_7 +.byte 102,15,56,221,209 + movups (%esi),%xmm6 + xorps %xmm2,%xmm6 + movups %xmm6,(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L044ctr32_two: + call __aesni_encrypt2 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L045ctr32_three: + call __aesni_encrypt3 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + movups 32(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm7,%xmm4 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L046ctr32_four: + call __aesni_encrypt4 + movups (%esi),%xmm6 + movups 16(%esi),%xmm7 + movups 32(%esi),%xmm1 + xorps %xmm6,%xmm2 + movups 48(%esi),%xmm0 + xorps %xmm7,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +L042ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movl 80(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_xts_encrypt +.private_extern _aes_hw_xts_encrypt +.align 4 +_aes_hw_xts_encrypt: +L_aes_hw_xts_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 36(%esp),%edx + movl 40(%esp),%esi + movl 240(%edx),%ecx + movups (%esi),%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L048enc1_loop_8: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L048enc1_loop_8 +.byte 102,15,56,221,209 + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl %esp,%ebp + subl $120,%esp + movl 240(%edx),%ecx + andl $-16,%esp + movl $135,96(%esp) + movl $0,100(%esp) + movl $1,104(%esp) + movl $0,108(%esp) + movl %eax,112(%esp) + movl %ebp,116(%esp) + movdqa %xmm2,%xmm1 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + pcmpgtd %xmm1,%xmm0 + andl $-16,%eax + movl %edx,%ebp + movl %ecx,%ebx + subl $96,%eax + jc L049xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L050xts_enc_loop6 +.align 4,0x90 +L050xts_enc_loop6: + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,16(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,32(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor 
%xmm2,%xmm1 + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,64(%esp) + paddq %xmm1,%xmm1 + movups (%ebp),%xmm0 + pand %xmm3,%xmm7 + movups (%esi),%xmm2 + pxor %xmm1,%xmm7 + movl %ebx,%ecx + movdqu 16(%esi),%xmm3 + xorps %xmm0,%xmm2 + movdqu 32(%esi),%xmm4 + pxor %xmm0,%xmm3 + movdqu 48(%esi),%xmm5 + pxor %xmm0,%xmm4 + movdqu 64(%esi),%xmm6 + pxor %xmm0,%xmm5 + movdqu 80(%esi),%xmm1 + pxor %xmm0,%xmm6 + leal 96(%esi),%esi + pxor (%esp),%xmm2 + movdqa %xmm7,80(%esp) + pxor %xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 +.byte 102,15,56,220,209 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 +.byte 102,15,56,220,217 + pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + call L_aesni_encrypt6_enter + movdqa 80(%esp),%xmm1 + pxor %xmm0,%xmm0 + xorps (%esp),%xmm2 + pcmpgtd %xmm1,%xmm0 + xorps 16(%esp),%xmm3 + movups %xmm2,(%edi) + xorps 32(%esp),%xmm4 + movups %xmm3,16(%edi) + xorps 48(%esp),%xmm5 + movups %xmm4,32(%edi) + xorps 64(%esp),%xmm6 + movups %xmm5,48(%edi) + xorps %xmm1,%xmm7 + movups %xmm6,64(%edi) + pshufd $19,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqa 96(%esp),%xmm3 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + subl $96,%eax + jnc L050xts_enc_loop6 + movl 240(%ebp),%ecx + movl %ebp,%edx + movl %ecx,%ebx +L049xts_enc_short: + addl $96,%eax + jz L051xts_enc_done6x + movdqa %xmm1,%xmm5 + cmpl $32,%eax + jb L052xts_enc_one + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + je L053xts_enc_two + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + cmpl $64,%eax + jb L054xts_enc_three + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm7 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + movdqa %xmm5,(%esp) + movdqa %xmm6,16(%esp) + je L055xts_enc_four + movdqa %xmm7,32(%esp) + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm7 + pxor %xmm1,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + pxor (%esp),%xmm2 + movdqu 48(%esi),%xmm5 + pxor 16(%esp),%xmm3 + movdqu 64(%esi),%xmm6 + pxor 32(%esp),%xmm4 + leal 80(%esi),%esi + pxor 48(%esp),%xmm5 + movdqa %xmm7,64(%esp) + pxor %xmm7,%xmm6 + call __aesni_encrypt6 + movaps 64(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps 32(%esp),%xmm4 + movups %xmm2,(%edi) + xorps 48(%esp),%xmm5 + movups %xmm3,16(%edi) + xorps %xmm1,%xmm6 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + leal 80(%edi),%edi + jmp L056xts_enc_done +.align 4,0x90 +L052xts_enc_one: + movups (%esi),%xmm2 + leal 16(%esi),%esi + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L057enc1_loop_9: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L057enc1_loop_9 +.byte 102,15,56,221,209 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) + leal 16(%edi),%edi + movdqa %xmm5,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L053xts_enc_two: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + leal 32(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + call __aesni_encrypt2 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 32(%edi),%edi + movdqa %xmm6,%xmm1 + jmp 
L056xts_enc_done +.align 4,0x90 +L054xts_enc_three: + movaps %xmm1,%xmm7 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + leal 48(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + call __aesni_encrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + leal 48(%edi),%edi + movdqa %xmm7,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L055xts_enc_four: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + xorps (%esp),%xmm2 + movups 48(%esi),%xmm5 + leal 64(%esi),%esi + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + xorps %xmm6,%xmm5 + call __aesni_encrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + xorps %xmm6,%xmm5 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + leal 64(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L051xts_enc_done6x: + movl 112(%esp),%eax + andl $15,%eax + jz L058xts_enc_ret + movdqa %xmm1,%xmm5 + movl %eax,112(%esp) + jmp L059xts_enc_steal +.align 4,0x90 +L056xts_enc_done: + movl 112(%esp),%eax + pxor %xmm0,%xmm0 + andl $15,%eax + jz L058xts_enc_ret + pcmpgtd %xmm1,%xmm0 + movl %eax,112(%esp) + pshufd $19,%xmm0,%xmm5 + paddq %xmm1,%xmm1 + pand 96(%esp),%xmm5 + pxor %xmm1,%xmm5 +L059xts_enc_steal: + movzbl (%esi),%ecx + movzbl -16(%edi),%edx + leal 1(%esi),%esi + movb %cl,-16(%edi) + movb %dl,(%edi) + leal 1(%edi),%edi + subl $1,%eax + jnz L059xts_enc_steal + subl 112(%esp),%edi + movl %ebp,%edx + movl %ebx,%ecx + movups -16(%edi),%xmm2 + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L060enc1_loop_10: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L060enc1_loop_10 +.byte 102,15,56,221,209 + xorps %xmm5,%xmm2 + movups %xmm2,-16(%edi) +L058xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) + movl 116(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_xts_decrypt +.private_extern _aes_hw_xts_decrypt +.align 4 +_aes_hw_xts_decrypt: +L_aes_hw_xts_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 36(%esp),%edx + movl 40(%esp),%esi + movl 240(%edx),%ecx + movups (%esi),%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L061enc1_loop_11: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L061enc1_loop_11 +.byte 102,15,56,221,209 + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl %esp,%ebp + subl $120,%esp + andl $-16,%esp + xorl %ebx,%ebx + testl $15,%eax + setnz %bl + shll $4,%ebx + subl %ebx,%eax + movl $135,96(%esp) + movl $0,100(%esp) + movl $1,104(%esp) + movl $0,108(%esp) + movl %eax,112(%esp) + movl %ebp,116(%esp) + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ecx,%ebx + movdqa %xmm2,%xmm1 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + pcmpgtd %xmm1,%xmm0 + andl $-16,%eax + subl $96,%eax + jc L062xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L063xts_dec_loop6 +.align 4,0x90 +L063xts_dec_loop6: + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,(%esp) 
+ paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,16(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,32(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,64(%esp) + paddq %xmm1,%xmm1 + movups (%ebp),%xmm0 + pand %xmm3,%xmm7 + movups (%esi),%xmm2 + pxor %xmm1,%xmm7 + movl %ebx,%ecx + movdqu 16(%esi),%xmm3 + xorps %xmm0,%xmm2 + movdqu 32(%esi),%xmm4 + pxor %xmm0,%xmm3 + movdqu 48(%esi),%xmm5 + pxor %xmm0,%xmm4 + movdqu 64(%esi),%xmm6 + pxor %xmm0,%xmm5 + movdqu 80(%esi),%xmm1 + pxor %xmm0,%xmm6 + leal 96(%esi),%esi + pxor (%esp),%xmm2 + movdqa %xmm7,80(%esp) + pxor %xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 +.byte 102,15,56,222,209 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + call L_aesni_decrypt6_enter + movdqa 80(%esp),%xmm1 + pxor %xmm0,%xmm0 + xorps (%esp),%xmm2 + pcmpgtd %xmm1,%xmm0 + xorps 16(%esp),%xmm3 + movups %xmm2,(%edi) + xorps 32(%esp),%xmm4 + movups %xmm3,16(%edi) + xorps 48(%esp),%xmm5 + movups %xmm4,32(%edi) + xorps 64(%esp),%xmm6 + movups %xmm5,48(%edi) + xorps %xmm1,%xmm7 + movups %xmm6,64(%edi) + pshufd $19,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqa 96(%esp),%xmm3 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + subl $96,%eax + jnc L063xts_dec_loop6 + movl 240(%ebp),%ecx + movl %ebp,%edx + movl %ecx,%ebx +L062xts_dec_short: + addl $96,%eax + jz L064xts_dec_done6x + movdqa %xmm1,%xmm5 + cmpl $32,%eax + jb L065xts_dec_one + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + je L066xts_dec_two + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + cmpl $64,%eax + jb L067xts_dec_three + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm7 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + movdqa %xmm5,(%esp) + movdqa %xmm6,16(%esp) + je L068xts_dec_four + movdqa %xmm7,32(%esp) + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm7 + pxor %xmm1,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + pxor (%esp),%xmm2 + movdqu 48(%esi),%xmm5 + pxor 16(%esp),%xmm3 + movdqu 64(%esi),%xmm6 + pxor 32(%esp),%xmm4 + leal 80(%esi),%esi + pxor 48(%esp),%xmm5 + movdqa %xmm7,64(%esp) + pxor %xmm7,%xmm6 + call __aesni_decrypt6 + movaps 64(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps 32(%esp),%xmm4 + movups %xmm2,(%edi) + xorps 48(%esp),%xmm5 + movups %xmm3,16(%edi) + xorps %xmm1,%xmm6 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + leal 80(%edi),%edi + jmp L069xts_dec_done +.align 4,0x90 +L065xts_dec_one: + movups (%esi),%xmm2 + leal 16(%esi),%esi + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L070dec1_loop_12: +.byte 102,15,56,222,209 + decl %ecx + 
movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L070dec1_loop_12 +.byte 102,15,56,223,209 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) + leal 16(%edi),%edi + movdqa %xmm5,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L066xts_dec_two: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + leal 32(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + call __aesni_decrypt2 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 32(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L067xts_dec_three: + movaps %xmm1,%xmm7 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + leal 48(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + call __aesni_decrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + leal 48(%edi),%edi + movdqa %xmm7,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L068xts_dec_four: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + xorps (%esp),%xmm2 + movups 48(%esi),%xmm5 + leal 64(%esi),%esi + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + xorps %xmm6,%xmm5 + call __aesni_decrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + xorps %xmm6,%xmm5 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + leal 64(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L064xts_dec_done6x: + movl 112(%esp),%eax + andl $15,%eax + jz L071xts_dec_ret + movl %eax,112(%esp) + jmp L072xts_dec_only_one_more +.align 4,0x90 +L069xts_dec_done: + movl 112(%esp),%eax + pxor %xmm0,%xmm0 + andl $15,%eax + jz L071xts_dec_ret + pcmpgtd %xmm1,%xmm0 + movl %eax,112(%esp) + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 +L072xts_dec_only_one_more: + pshufd $19,%xmm0,%xmm5 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm5 + pxor %xmm1,%xmm5 + movl %ebp,%edx + movl %ebx,%ecx + movups (%esi),%xmm2 + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L073dec1_loop_13: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L073dec1_loop_13 +.byte 102,15,56,223,209 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) +L074xts_dec_steal: + movzbl 16(%esi),%ecx + movzbl (%edi),%edx + leal 1(%esi),%esi + movb %cl,(%edi) + movb %dl,16(%edi) + leal 1(%edi),%edi + subl $1,%eax + jnz L074xts_dec_steal + subl 112(%esp),%edi + movl %ebp,%edx + movl %ebx,%ecx + movups (%edi),%xmm2 + xorps %xmm6,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L075dec1_loop_14: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L075dec1_loop_14 +.byte 102,15,56,223,209 + xorps %xmm6,%xmm2 + movups %xmm2,(%edi) +L071xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) + movl 116(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt +.align 4 +_aes_hw_cbc_encrypt: +L_aes_hw_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 
20(%esp),%esi + movl %esp,%ebx + movl 24(%esp),%edi + subl $24,%ebx + movl 28(%esp),%eax + andl $-16,%ebx + movl 32(%esp),%edx + movl 36(%esp),%ebp + testl %eax,%eax + jz L076cbc_abort + cmpl $0,40(%esp) + xchgl %esp,%ebx + movups (%ebp),%xmm7 + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ebx,16(%esp) + movl %ecx,%ebx + je L077cbc_decrypt + movaps %xmm7,%xmm2 + cmpl $16,%eax + jb L078cbc_enc_tail + subl $16,%eax + jmp L079cbc_enc_loop +.align 4,0x90 +L079cbc_enc_loop: + movups (%esi),%xmm7 + leal 16(%esi),%esi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm7 + leal 32(%edx),%edx + xorps %xmm7,%xmm2 +L080enc1_loop_15: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L080enc1_loop_15 +.byte 102,15,56,221,209 + movl %ebx,%ecx + movl %ebp,%edx + movups %xmm2,(%edi) + leal 16(%edi),%edi + subl $16,%eax + jnc L079cbc_enc_loop + addl $16,%eax + jnz L078cbc_enc_tail + movaps %xmm2,%xmm7 + pxor %xmm2,%xmm2 + jmp L081cbc_ret +L078cbc_enc_tail: + movl %eax,%ecx +.long 2767451785 + movl $16,%ecx + subl %eax,%ecx + xorl %eax,%eax +.long 2868115081 + leal -16(%edi),%edi + movl %ebx,%ecx + movl %edi,%esi + movl %ebp,%edx + jmp L079cbc_enc_loop +.align 4,0x90 +L077cbc_decrypt: + cmpl $80,%eax + jbe L082cbc_dec_tail + movaps %xmm7,(%esp) + subl $80,%eax + jmp L083cbc_dec_loop6_enter +.align 4,0x90 +L084cbc_dec_loop6: + movaps %xmm0,(%esp) + movups %xmm7,(%edi) + leal 16(%edi),%edi +L083cbc_dec_loop6_enter: + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + call __aesni_decrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps (%esp),%xmm2 + xorps %xmm1,%xmm3 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm4 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm5 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm6 + movups 80(%esi),%xmm0 + xorps %xmm1,%xmm7 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 96(%esi),%esi + movups %xmm4,32(%edi) + movl %ebx,%ecx + movups %xmm5,48(%edi) + movl %ebp,%edx + movups %xmm6,64(%edi) + leal 80(%edi),%edi + subl $96,%eax + ja L084cbc_dec_loop6 + movaps %xmm7,%xmm2 + movaps %xmm0,%xmm7 + addl $80,%eax + jle L085cbc_dec_clear_tail_collected + movups %xmm2,(%edi) + leal 16(%edi),%edi +L082cbc_dec_tail: + movups (%esi),%xmm2 + movaps %xmm2,%xmm6 + cmpl $16,%eax + jbe L086cbc_dec_one + movups 16(%esi),%xmm3 + movaps %xmm3,%xmm5 + cmpl $32,%eax + jbe L087cbc_dec_two + movups 32(%esi),%xmm4 + cmpl $48,%eax + jbe L088cbc_dec_three + movups 48(%esi),%xmm5 + cmpl $64,%eax + jbe L089cbc_dec_four + movups 64(%esi),%xmm6 + movaps %xmm7,(%esp) + movups (%esi),%xmm2 + xorps %xmm7,%xmm7 + call __aesni_decrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps (%esp),%xmm2 + xorps %xmm1,%xmm3 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm4 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm5 + movups 64(%esi),%xmm7 + xorps %xmm0,%xmm6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 + leal 64(%edi),%edi + movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 + subl $80,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L086cbc_dec_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L091dec1_loop_16: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L091dec1_loop_16 +.byte 102,15,56,223,209 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm7 + subl $16,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 
+L087cbc_dec_two: + call __aesni_decrypt2 + xorps %xmm7,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + leal 16(%edi),%edi + movaps %xmm5,%xmm7 + subl $32,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L088cbc_dec_three: + call __aesni_decrypt3 + xorps %xmm7,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm5,%xmm4 + movups %xmm2,(%edi) + movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + leal 32(%edi),%edi + movups 32(%esi),%xmm7 + subl $48,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L089cbc_dec_four: + call __aesni_decrypt4 + movups 16(%esi),%xmm1 + movups 32(%esi),%xmm0 + xorps %xmm7,%xmm2 + movups 48(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 + leal 48(%edi),%edi + movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 + subl $64,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L085cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +L090cbc_dec_tail_collected: + andl $15,%eax + jnz L092cbc_dec_tail_partial + movups %xmm2,(%edi) + pxor %xmm0,%xmm0 + jmp L081cbc_ret +.align 4,0x90 +L092cbc_dec_tail_partial: + movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 + movl $16,%ecx + movl %esp,%esi + subl %eax,%ecx +.long 2767451785 + movdqa %xmm2,(%esp) +L081cbc_ret: + movl 16(%esp),%esp + movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 + movups %xmm7,(%ebp) + pxor %xmm7,%xmm7 +L076cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_set_encrypt_key_base +.private_extern _aes_hw_set_encrypt_key_base +.align 4 +_aes_hw_set_encrypt_key_base: +L_aes_hw_set_encrypt_key_base_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L093pic_for_function_hit +L093pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+3-L093pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call L094pic +L094pic: + popl %ebx + leal Lkey_const-L094pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je L09514rounds + cmpl $192,%ecx + je L09612rounds + cmpl $128,%ecx + jne L097bad_keybits +.align 4,0x90 +L09810rounds: + movl $9,%ecx + movups %xmm0,-16(%edx) +.byte 102,15,58,223,200,1 + call L099key_128_cold +.byte 102,15,58,223,200,2 + call L100key_128 +.byte 102,15,58,223,200,4 + call L100key_128 +.byte 102,15,58,223,200,8 + call L100key_128 +.byte 102,15,58,223,200,16 + call L100key_128 +.byte 102,15,58,223,200,32 + call L100key_128 +.byte 102,15,58,223,200,64 + call L100key_128 +.byte 102,15,58,223,200,128 + call L100key_128 +.byte 102,15,58,223,200,27 + call L100key_128 +.byte 102,15,58,223,200,54 + call L100key_128 + movups %xmm0,(%edx) + movl %ecx,80(%edx) + jmp L101good_key +.align 4,0x90 +L100key_128: + movups %xmm0,(%edx) + leal 16(%edx),%edx +L099key_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 4,0x90 +L09612rounds: + movq 16(%eax),%xmm2 + movl $11,%ecx + movups %xmm0,-16(%edx) +.byte 102,15,58,223,202,1 + call L102key_192a_cold +.byte 102,15,58,223,202,2 + call L103key_192b +.byte 102,15,58,223,202,4 + call L104key_192a +.byte 102,15,58,223,202,8 + call L103key_192b +.byte 102,15,58,223,202,16 + call L104key_192a +.byte 
102,15,58,223,202,32 + call L103key_192b +.byte 102,15,58,223,202,64 + call L104key_192a +.byte 102,15,58,223,202,128 + call L103key_192b + movups %xmm0,(%edx) + movl %ecx,48(%edx) + jmp L101good_key +.align 4,0x90 +L104key_192a: + movups %xmm0,(%edx) + leal 16(%edx),%edx +.align 4,0x90 +L102key_192a_cold: + movaps %xmm2,%xmm5 +L105key_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret +.align 4,0x90 +L103key_192b: + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%edx) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%edx) + leal 32(%edx),%edx + jmp L105key_192b_warm +.align 4,0x90 +L09514rounds: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movl $13,%ecx + movups %xmm0,-32(%edx) + movups %xmm2,-16(%edx) +.byte 102,15,58,223,202,1 + call L106key_256a_cold +.byte 102,15,58,223,200,1 + call L107key_256b +.byte 102,15,58,223,202,2 + call L108key_256a +.byte 102,15,58,223,200,2 + call L107key_256b +.byte 102,15,58,223,202,4 + call L108key_256a +.byte 102,15,58,223,200,4 + call L107key_256b +.byte 102,15,58,223,202,8 + call L108key_256a +.byte 102,15,58,223,200,8 + call L107key_256b +.byte 102,15,58,223,202,16 + call L108key_256a +.byte 102,15,58,223,200,16 + call L107key_256b +.byte 102,15,58,223,202,32 + call L108key_256a +.byte 102,15,58,223,200,32 + call L107key_256b +.byte 102,15,58,223,202,64 + call L108key_256a + movups %xmm0,(%edx) + movl %ecx,16(%edx) + xorl %eax,%eax + jmp L101good_key +.align 4,0x90 +L108key_256a: + movups %xmm2,(%edx) + leal 16(%edx),%edx +L106key_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 4,0x90 +L107key_256b: + movups %xmm0,(%edx) + leal 16(%edx),%edx + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +L101good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 2,0x90 +L097bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.globl _aes_hw_set_encrypt_key_alt +.private_extern _aes_hw_set_encrypt_key_alt +.align 4 +_aes_hw_set_encrypt_key_alt: +L_aes_hw_set_encrypt_key_alt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L109pic_for_function_hit +L109pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+3-L109pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call L110pic +L110pic: + popl %ebx + leal Lkey_const-L110pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je L11114rounds_alt + cmpl $192,%ecx + je L11212rounds_alt + cmpl $128,%ecx + jne L113bad_keybits +.align 4,0x90 +L11410rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +L115loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz L115loop_key128 + 
movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp L116good_key +.align 4,0x90 +L11212rounds_alt: + movq 16(%eax),%xmm2 + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +L117loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz L117loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp L116good_key +.align 4,0x90 +L11114rounds_alt: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +L118loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz L119done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp L118loop_key256 +L119done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +L116good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 2,0x90 +L113bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.globl _aes_hw_encrypt_key_to_decrypt_key +.private_extern _aes_hw_encrypt_key_to_decrypt_key +.align 4 +_aes_hw_encrypt_key_to_decrypt_key: +L_aes_hw_encrypt_key_to_decrypt_key_begin: + movl 4(%esp),%edx + movl 240(%edx),%ecx + shll $4,%ecx + leal 16(%edx,%ecx,1),%eax + movups (%edx),%xmm0 + movups (%eax),%xmm1 + movups %xmm0,(%eax) + movups %xmm1,(%edx) + leal 16(%edx),%edx + leal -16(%eax),%eax +L120dec_key_inverse: + movups (%edx),%xmm0 + movups (%eax),%xmm1 +.byte 102,15,56,219,192 +.byte 102,15,56,219,201 + leal 16(%edx),%edx + leal -16(%eax),%eax + movups %xmm0,16(%eax) + movups %xmm1,-16(%edx) + cmpl %edx,%eax + ja L120dec_key_inverse + movups (%edx),%xmm0 +.byte 102,15,56,219,192 + movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + ret +.align 6,0x90 +Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +.byte 115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if 
defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-linux.S similarity index 93% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-linux.S index 8d6cf17d..0c76eb2d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -18,10 +17,10 @@ aes_hw_encrypt: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx - call .L000pic -.L000pic: + call .L000pic_for_function_hit +.L000pic_for_function_hit: popl %ebx - leal BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx + leal BORINGSSL_function_hit+1-.L000pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx @@ -849,10 +848,10 @@ aes_hw_ctr32_encrypt_blocks: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx - call .L038pic -.L038pic: + call .L038pic_for_function_hit +.L038pic_for_function_hit: popl %ebx - leal BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx + leal BORINGSSL_function_hit+0-.L038pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx @@ -2101,26 +2100,35 @@ aes_hw_cbc_encrypt: popl %ebp ret .size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin -.hidden _aesni_set_encrypt_key -.type _aesni_set_encrypt_key,@function +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function .align 16 -_aesni_set_encrypt_key: - pushl %ebp +aes_hw_set_encrypt_key_base: +.L_aes_hw_set_encrypt_key_base_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L093pic_for_function_hit +.L093pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L093pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx pushl %ebx - testl %eax,%eax - jz .L093bad_pointer - testl %edx,%edx - jz .L093bad_pointer call .L094pic .L094pic: popl %ebx leal .Lkey_const-.L094pic(%ebx),%ebx - leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 - movl 4(%ebp),%ebp leal 16(%edx),%edx - andl $268437504,%ebp cmpl $256,%ecx je .L09514rounds cmpl $192,%ecx @@ -2129,38 +2137,36 @@ _aesni_set_encrypt_key: jne .L097bad_keybits .align 16 .L09810rounds: - cmpl $268435456,%ebp - je .L09910rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L100key_128_cold + call .L099key_128_cold .byte 102,15,58,223,200,2 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,4 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,8 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,16 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,32 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,64 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,128 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,27 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,54 - call .L101key_128 + call .L100key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - jmp .L102good_key + jmp .L101good_key .align 16 -.L101key_128: 
+.L100key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L100key_128_cold: +.L099key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2169,91 +2175,37 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L09910rounds_alt: - movdqa (%ebx),%xmm5 - movl $8,%ecx - movdqa 32(%ebx),%xmm4 - movdqa %xmm0,%xmm2 - movdqu %xmm0,-16(%edx) -.L103loop_key128: -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - leal 16(%edx),%edx - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,-16(%edx) - movdqa %xmm0,%xmm2 - decl %ecx - jnz .L103loop_key128 - movdqa 48(%ebx),%xmm4 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,(%edx) - movdqa %xmm0,%xmm2 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,16(%edx) - movl $9,%ecx - movl %ecx,96(%edx) - jmp .L102good_key -.align 16 .L09612rounds: movq 16(%eax),%xmm2 - cmpl $268435456,%ebp - je .L10412rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L105key_192a_cold + call .L102key_192a_cold .byte 102,15,58,223,202,2 - call .L106key_192b + call .L103key_192b .byte 102,15,58,223,202,4 - call .L107key_192a + call .L104key_192a .byte 102,15,58,223,202,8 - call .L106key_192b + call .L103key_192b .byte 102,15,58,223,202,16 - call .L107key_192a + call .L104key_192a .byte 102,15,58,223,202,32 - call .L106key_192b + call .L103key_192b .byte 102,15,58,223,202,64 - call .L107key_192a + call .L104key_192a .byte 102,15,58,223,202,128 - call .L106key_192b + call .L103key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - jmp .L102good_key + jmp .L101good_key .align 16 -.L107key_192a: +.L104key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L105key_192a_cold: +.L102key_192a_cold: movaps %xmm2,%xmm5 -.L108key_192b_warm: +.L105key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2267,90 +2219,56 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L106key_192b: +.L103key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L108key_192b_warm -.align 16 -.L10412rounds_alt: - movdqa 16(%ebx),%xmm5 - movdqa 32(%ebx),%xmm4 - movl $8,%ecx - movdqu %xmm0,-16(%edx) -.L109loop_key192: - movq %xmm2,(%edx) - movdqa %xmm2,%xmm1 -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - pslld $1,%xmm4 - leal 24(%edx),%edx - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm2 - movdqu %xmm0,-16(%edx) - decl %ecx - jnz .L109loop_key192 - movl $11,%ecx - movl %ecx,32(%edx) - jmp .L102good_key + jmp .L105key_192b_warm .align 16 .L09514rounds: movups 16(%eax),%xmm2 leal 16(%edx),%edx - cmpl $268435456,%ebp - je .L11014rounds_alt movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L111key_256a_cold + call .L106key_256a_cold .byte 102,15,58,223,200,1 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,2 - 
call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,2 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,4 - call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,4 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,8 - call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,8 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,16 - call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,16 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,32 - call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,32 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,64 - call .L113key_256a + call .L108key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - jmp .L102good_key + jmp .L101good_key .align 16 -.L113key_256a: +.L108key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L111key_256a_cold: +.L106key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2359,7 +2277,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L112key_256b: +.L107key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2369,15 +2287,154 @@ _aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.L101good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 4 +.L097bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.size aes_hw_set_encrypt_key_base,.-.L_aes_hw_set_encrypt_key_base_begin +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.L_aes_hw_set_encrypt_key_alt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L109pic_for_function_hit +.L109pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L109pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call .L110pic +.L110pic: + popl %ebx + leal .Lkey_const-.L110pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je .L11114rounds_alt + cmpl $192,%ecx + je .L11212rounds_alt + cmpl $128,%ecx + jne .L113bad_keybits .align 16 -.L11014rounds_alt: +.L11410rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L115loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L115loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L116good_key +.align 16 +.L11212rounds_alt: + movq 16(%eax),%xmm2 + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl 
$8,%ecx + movdqu %xmm0,-16(%edx) +.L117loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz .L117loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp .L116good_key +.align 16 +.L11114rounds_alt: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx movdqa (%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $7,%ecx movdqu %xmm0,-32(%edx) movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) -.L114loop_key256: +.L118loop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 @@ -2391,7 +2448,7 @@ _aesni_set_encrypt_key: pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) decl %ecx - jz .L115done_key256 + jz .L119done_key256 pshufd $255,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -2406,11 +2463,11 @@ _aesni_set_encrypt_key: movdqu %xmm2,16(%edx) leal 32(%edx),%edx movdqa %xmm2,%xmm1 - jmp .L114loop_key256 -.L115done_key256: + jmp .L118loop_key256 +.L119done_key256: movl $13,%ecx movl %ecx,16(%edx) -.L102good_key: +.L116good_key: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2419,60 +2476,23 @@ _aesni_set_encrypt_key: pxor %xmm5,%xmm5 xorl %eax,%eax popl %ebx - popl %ebp - ret -.align 4 -.L093bad_pointer: - movl $-1,%eax - popl %ebx - popl %ebp ret .align 4 -.L097bad_keybits: +.L113bad_keybits: pxor %xmm0,%xmm0 movl $-2,%eax popl %ebx - popl %ebp ret -.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key -.globl aes_hw_set_encrypt_key -.hidden aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,@function -.align 16 -aes_hw_set_encrypt_key: -.L_aes_hw_set_encrypt_key_begin: -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call .L116pic -.L116pic: - popl %ebx - leal BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 4(%esp),%eax - movl 8(%esp),%ecx - movl 12(%esp),%edx - call _aesni_set_encrypt_key - ret -.size aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin -.globl aes_hw_set_decrypt_key -.hidden aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,@function -.align 16 -aes_hw_set_decrypt_key: -.L_aes_hw_set_decrypt_key_begin: - movl 4(%esp),%eax - movl 8(%esp),%ecx - movl 12(%esp),%edx - call _aesni_set_encrypt_key - movl 12(%esp),%edx +.size aes_hw_set_encrypt_key_alt,.-.L_aes_hw_set_encrypt_key_alt_begin +.globl aes_hw_encrypt_key_to_decrypt_key +.hidden aes_hw_encrypt_key_to_decrypt_key +.type aes_hw_encrypt_key_to_decrypt_key,@function +.align 16 +aes_hw_encrypt_key_to_decrypt_key: +.L_aes_hw_encrypt_key_to_decrypt_key_begin: + movl 4(%esp),%edx + movl 240(%edx),%ecx shll $4,%ecx - testl %eax,%eax - jnz .L117dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2480,7 +2500,7 @@ aes_hw_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L118dec_key_inverse: +.L120dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2490,16 +2510,14 @@ aes_hw_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L118dec_key_inverse + ja .L120dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 - xorl %eax,%eax -.L117dec_key_ret: ret -.size 
aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin +.size aes_hw_encrypt_key_to_decrypt_key,.-.L_aes_hw_encrypt_key_to_decrypt_key_begin .align 64 .Lkey_const: .long 202313229,202313229,202313229,202313229 @@ -2511,7 +2529,6 @@ aes_hw_set_decrypt_key: .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-apple.S similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-apple.S index dfe7d257..dc8e82d5 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -7,7 +6,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - .globl _aes_hw_encrypt .private_extern _aes_hw_encrypt @@ -1907,76 +1905,63 @@ L$cbc_ret: ret -.globl _aes_hw_set_decrypt_key -.private_extern _aes_hw_set_decrypt_key +.globl _aes_hw_encrypt_key_to_decrypt_key +.private_extern _aes_hw_encrypt_key_to_decrypt_key .p2align 4 -_aes_hw_set_decrypt_key: +_aes_hw_encrypt_key_to_decrypt_key: _CET_ENDBR -.byte 0x48,0x83,0xEC,0x08 - call __aesni_set_encrypt_key + movl 240(%rdi),%esi shll $4,%esi - testl %eax,%eax - jnz L$dec_key_ret - leaq 16(%rdx,%rsi,1),%rdi - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 - movups %xmm0,(%rdi) - movups %xmm1,(%rdx) - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi + leaq 16(%rdi,%rsi,1),%rdx + + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 + movups %xmm0,(%rdx) + movups %xmm1,(%rdi) + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx L$dec_key_inverse: - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 .byte 102,15,56,219,192 .byte 102,15,56,219,201 - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi - movups %xmm0,16(%rdi) - movups %xmm1,-16(%rdx) - cmpq %rdx,%rdi + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx + movups %xmm0,16(%rdx) + movups %xmm1,-16(%rdi) + cmpq %rdi,%rdx ja L$dec_key_inverse - movups (%rdx),%xmm0 + movups (%rdi),%xmm0 .byte 102,15,56,219,192 pxor %xmm1,%xmm1 - movups %xmm0,(%rdi) + movups %xmm0,(%rdx) pxor %xmm0,%xmm0 -L$dec_key_ret: - addq $8,%rsp - ret -L$SEH_end_set_decrypt_key: -.globl _aes_hw_set_encrypt_key -.private_extern _aes_hw_set_encrypt_key +.globl _aes_hw_set_encrypt_key_base +.private_extern _aes_hw_set_encrypt_key_base .p2align 4 -_aes_hw_set_encrypt_key: -__aesni_set_encrypt_key: +_aes_hw_set_encrypt_key_base: + _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,_BORINGSSL_function_hit+3(%rip) #endif -.byte 0x48,0x83,0xEC,0x08 + subq $8,%rsp + - movq $-1,%rax - testq %rdi,%rdi - jz L$enc_key_ret - testq %rdx,%rdx - jz L$enc_key_ret movups (%rdi),%xmm0 xorps %xmm4,%xmm4 - leaq _OPENSSL_ia32cap_P(%rip),%r10 - movl 4(%r10),%r10d - andl $268437504,%r10d leaq 16(%rdx),%rax cmpl $256,%esi je L$14rounds @@ -1987,8 +1972,6 @@ _CET_ENDBR L$10rounds: movl $9,%esi - cmpl $268435456,%r10d - je L$10rounds_alt movups %xmm0,(%rdx) .byte 
102,15,58,223,200,1 @@ -2017,7 +2000,193 @@ L$10rounds: jmp L$enc_key_ret .p2align 4 -L$10rounds_alt: +L$12rounds: + movq 16(%rdi),%xmm2 + movl $11,%esi + + movups %xmm0,(%rdx) +.byte 102,15,58,223,202,1 + call L$key_expansion_192a_cold +.byte 102,15,58,223,202,2 + call L$key_expansion_192b +.byte 102,15,58,223,202,4 + call L$key_expansion_192a +.byte 102,15,58,223,202,8 + call L$key_expansion_192b +.byte 102,15,58,223,202,16 + call L$key_expansion_192a +.byte 102,15,58,223,202,32 + call L$key_expansion_192b +.byte 102,15,58,223,202,64 + call L$key_expansion_192a +.byte 102,15,58,223,202,128 + call L$key_expansion_192b + movups %xmm0,(%rax) + movl %esi,48(%rax) + xorq %rax,%rax + jmp L$enc_key_ret + +.p2align 4 +L$14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call L$key_expansion_256a_cold +.byte 102,15,58,223,200,1 + call L$key_expansion_256b +.byte 102,15,58,223,202,2 + call L$key_expansion_256a +.byte 102,15,58,223,200,2 + call L$key_expansion_256b +.byte 102,15,58,223,202,4 + call L$key_expansion_256a +.byte 102,15,58,223,200,4 + call L$key_expansion_256b +.byte 102,15,58,223,202,8 + call L$key_expansion_256a +.byte 102,15,58,223,200,8 + call L$key_expansion_256b +.byte 102,15,58,223,202,16 + call L$key_expansion_256a +.byte 102,15,58,223,200,16 + call L$key_expansion_256b +.byte 102,15,58,223,202,32 + call L$key_expansion_256a +.byte 102,15,58,223,200,32 + call L$key_expansion_256b +.byte 102,15,58,223,202,64 + call L$key_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp L$enc_key_ret + +.p2align 4 +L$bad_keybits: + movq $-2,%rax +L$enc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + + ret + + + +.p2align 4 +L$key_expansion_128: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax +L$key_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_192a: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax +L$key_expansion_192a_cold: + movaps %xmm2,%xmm5 +L$key_expansion_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret + + +.p2align 4 +L$key_expansion_192b: + + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%rax) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%rax) + leaq 32(%rax),%rax + jmp L$key_expansion_192b_warm + + +.p2align 4 +L$key_expansion_256a: + + movups %xmm2,(%rax) + leaq 16(%rax),%rax +L$key_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_256b: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret + + + +.globl _aes_hw_set_encrypt_key_alt +.private_extern _aes_hw_set_encrypt_key_alt + +.p2align 4 +_aes_hw_set_encrypt_key_alt: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp + + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je L$14rounds_alt + 
cmpl $192,%esi + je L$12rounds_alt + cmpl $128,%esi + jne L$bad_keybits_alt + + movl $9,%esi movdqa L$key_rotate(%rip),%xmm5 movl $8,%r10d movdqa L$key_rcon1(%rip),%xmm4 @@ -2081,39 +2250,12 @@ L$oop_key128: movl %esi,96(%rax) xorl %eax,%eax - jmp L$enc_key_ret + jmp L$enc_key_ret_alt .p2align 4 -L$12rounds: +L$12rounds_alt: movq 16(%rdi),%xmm2 movl $11,%esi - cmpl $268435456,%r10d - je L$12rounds_alt - - movups %xmm0,(%rdx) -.byte 102,15,58,223,202,1 - call L$key_expansion_192a_cold -.byte 102,15,58,223,202,2 - call L$key_expansion_192b -.byte 102,15,58,223,202,4 - call L$key_expansion_192a -.byte 102,15,58,223,202,8 - call L$key_expansion_192b -.byte 102,15,58,223,202,16 - call L$key_expansion_192a -.byte 102,15,58,223,202,32 - call L$key_expansion_192b -.byte 102,15,58,223,202,64 - call L$key_expansion_192a -.byte 102,15,58,223,202,128 - call L$key_expansion_192b - movups %xmm0,(%rax) - movl %esi,48(%rax) - xorq %rax,%rax - jmp L$enc_key_ret - -.p2align 4 -L$12rounds_alt: movdqa L$key_rotate192(%rip),%xmm5 movdqa L$key_rcon1(%rip),%xmm4 movl $8,%r10d @@ -2151,51 +2293,13 @@ L$oop_key192: movl %esi,32(%rax) xorl %eax,%eax - jmp L$enc_key_ret + jmp L$enc_key_ret_alt .p2align 4 -L$14rounds: +L$14rounds_alt: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax - cmpl $268435456,%r10d - je L$14rounds_alt - - movups %xmm0,(%rdx) - movups %xmm2,16(%rdx) -.byte 102,15,58,223,202,1 - call L$key_expansion_256a_cold -.byte 102,15,58,223,200,1 - call L$key_expansion_256b -.byte 102,15,58,223,202,2 - call L$key_expansion_256a -.byte 102,15,58,223,200,2 - call L$key_expansion_256b -.byte 102,15,58,223,202,4 - call L$key_expansion_256a -.byte 102,15,58,223,200,4 - call L$key_expansion_256b -.byte 102,15,58,223,202,8 - call L$key_expansion_256a -.byte 102,15,58,223,200,8 - call L$key_expansion_256b -.byte 102,15,58,223,202,16 - call L$key_expansion_256a -.byte 102,15,58,223,200,16 - call L$key_expansion_256b -.byte 102,15,58,223,202,32 - call L$key_expansion_256a -.byte 102,15,58,223,200,32 - call L$key_expansion_256b -.byte 102,15,58,223,202,64 - call L$key_expansion_256a - movups %xmm0,(%rax) - movl %esi,16(%rax) - xorq %rax,%rax - jmp L$enc_key_ret - -.p2align 4 -L$14rounds_alt: movdqa L$key_rotate(%rip),%xmm5 movdqa L$key_rcon1(%rip),%xmm4 movl $7,%r10d @@ -2246,12 +2350,12 @@ L$oop_key256: L$done_key256: movl %esi,16(%rax) xorl %eax,%eax - jmp L$enc_key_ret + jmp L$enc_key_ret_alt .p2align 4 -L$bad_keybits: +L$bad_keybits_alt: movq $-2,%rax -L$enc_key_ret: +L$enc_key_ret_alt: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2262,76 +2366,6 @@ L$enc_key_ret: ret -L$SEH_end_set_encrypt_key: - -.p2align 4 -L$key_expansion_128: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -L$key_expansion_128_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret - -.p2align 4 -L$key_expansion_192a: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -L$key_expansion_192a_cold: - movaps %xmm2,%xmm5 -L$key_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - pslldq $4,%xmm3 - xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - ret - -.p2align 4 -L$key_expansion_192b: - movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 - movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 - movups %xmm3,16(%rax) - leaq 32(%rax),%rax - jmp L$key_expansion_192b_warm - -.p2align 4 -L$key_expansion_256a: - movups %xmm2,(%rax) - 
leaq 16(%rax),%rax -L$key_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret - -.p2align 4 -L$key_expansion_256b: - movups %xmm0,(%rax) - leaq 16(%rax),%rax - - shufps $16,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 - xorps %xmm1,%xmm2 - ret .section __DATA,__const @@ -2359,7 +2393,6 @@ L$key_rcon1b: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-linux.S similarity index 96% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-linux.S index e86b7b84..eb34d936 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -7,8 +6,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P .globl aes_hw_encrypt .hidden aes_hw_encrypt .type aes_hw_encrypt,@function @@ -1909,76 +1906,63 @@ _CET_ENDBR ret .cfi_endproc .size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt -.globl aes_hw_set_decrypt_key -.hidden aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,@function +.globl aes_hw_encrypt_key_to_decrypt_key +.hidden aes_hw_encrypt_key_to_decrypt_key +.type aes_hw_encrypt_key_to_decrypt_key,@function .align 16 -aes_hw_set_decrypt_key: +aes_hw_encrypt_key_to_decrypt_key: .cfi_startproc _CET_ENDBR -.byte 0x48,0x83,0xEC,0x08 -.cfi_adjust_cfa_offset 8 - call __aesni_set_encrypt_key + + movl 240(%rdi),%esi shll $4,%esi - testl %eax,%eax - jnz .Ldec_key_ret - leaq 16(%rdx,%rsi,1),%rdi - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 - movups %xmm0,(%rdi) - movups %xmm1,(%rdx) - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi + leaq 16(%rdi,%rsi,1),%rdx + + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 + movups %xmm0,(%rdx) + movups %xmm1,(%rdi) + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx .Ldec_key_inverse: - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 .byte 102,15,56,219,192 .byte 102,15,56,219,201 - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi - movups %xmm0,16(%rdi) - movups %xmm1,-16(%rdx) - cmpq %rdx,%rdi + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx + movups %xmm0,16(%rdx) + movups %xmm1,-16(%rdi) + cmpq %rdi,%rdx ja .Ldec_key_inverse - movups (%rdx),%xmm0 + movups (%rdi),%xmm0 .byte 102,15,56,219,192 pxor %xmm1,%xmm1 - movups %xmm0,(%rdi) + movups %xmm0,(%rdx) pxor %xmm0,%xmm0 -.Ldec_key_ret: - addq $8,%rsp -.cfi_adjust_cfa_offset -8 ret .cfi_endproc -.LSEH_end_set_decrypt_key: -.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key -.globl aes_hw_set_encrypt_key -.hidden aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,@function -.align 16 -aes_hw_set_encrypt_key: -__aesni_set_encrypt_key: +.size aes_hw_encrypt_key_to_decrypt_key,.-aes_hw_encrypt_key_to_decrypt_key +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function 
+.align 16 +aes_hw_set_encrypt_key_base: .cfi_startproc + _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,BORINGSSL_function_hit+3(%rip) #endif -.byte 0x48,0x83,0xEC,0x08 + subq $8,%rsp .cfi_adjust_cfa_offset 8 - movq $-1,%rax - testq %rdi,%rdi - jz .Lenc_key_ret - testq %rdx,%rdx - jz .Lenc_key_ret + movups (%rdi),%xmm0 xorps %xmm4,%xmm4 - leaq OPENSSL_ia32cap_P(%rip),%r10 - movl 4(%r10),%r10d - andl $268437504,%r10d leaq 16(%rdx),%rax cmpl $256,%esi je .L14rounds @@ -1989,8 +1973,6 @@ _CET_ENDBR .L10rounds: movl $9,%esi - cmpl $268435456,%r10d - je .L10rounds_alt movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 @@ -2019,7 +2001,193 @@ _CET_ENDBR jmp .Lenc_key_ret .align 16 -.L10rounds_alt: +.L12rounds: + movq 16(%rdi),%xmm2 + movl $11,%esi + + movups %xmm0,(%rdx) +.byte 102,15,58,223,202,1 + call .Lkey_expansion_192a_cold +.byte 102,15,58,223,202,2 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,4 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,8 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,16 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,32 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,64 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,128 + call .Lkey_expansion_192b + movups %xmm0,(%rax) + movl %esi,48(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.L14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call .Lkey_expansion_256a_cold +.byte 102,15,58,223,200,1 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,2 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,2 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,4 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,4 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,8 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,8 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,16 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,16 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,32 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,32 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,64 + call .Lkey_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + movq $-2,%rax +.Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc + + +.align 16 +.Lkey_expansion_128: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_192a: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_192a_cold: + movaps %xmm2,%xmm5 +.Lkey_expansion_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_192b: +.cfi_startproc + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%rax) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%rax) + leaq 32(%rax),%rax + jmp .Lkey_expansion_192b_warm +.cfi_endproc + +.align 16 +.Lkey_expansion_256a: +.cfi_startproc + movups %xmm2,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_256a_cold: + 
shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256b: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +.cfi_endproc +.size aes_hw_set_encrypt_key_base,.-aes_hw_set_encrypt_key_base + +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds_alt + cmpl $192,%esi + je .L12rounds_alt + cmpl $128,%esi + jne .Lbad_keybits_alt + + movl $9,%esi movdqa .Lkey_rotate(%rip),%xmm5 movl $8,%r10d movdqa .Lkey_rcon1(%rip),%xmm4 @@ -2083,39 +2251,12 @@ _CET_ENDBR movl %esi,96(%rax) xorl %eax,%eax - jmp .Lenc_key_ret + jmp .Lenc_key_ret_alt .align 16 -.L12rounds: +.L12rounds_alt: movq 16(%rdi),%xmm2 movl $11,%esi - cmpl $268435456,%r10d - je .L12rounds_alt - - movups %xmm0,(%rdx) -.byte 102,15,58,223,202,1 - call .Lkey_expansion_192a_cold -.byte 102,15,58,223,202,2 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,4 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,8 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,16 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,32 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,64 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,128 - call .Lkey_expansion_192b - movups %xmm0,(%rax) - movl %esi,48(%rax) - xorq %rax,%rax - jmp .Lenc_key_ret - -.align 16 -.L12rounds_alt: movdqa .Lkey_rotate192(%rip),%xmm5 movdqa .Lkey_rcon1(%rip),%xmm4 movl $8,%r10d @@ -2153,51 +2294,13 @@ _CET_ENDBR movl %esi,32(%rax) xorl %eax,%eax - jmp .Lenc_key_ret + jmp .Lenc_key_ret_alt .align 16 -.L14rounds: +.L14rounds_alt: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax - cmpl $268435456,%r10d - je .L14rounds_alt - - movups %xmm0,(%rdx) - movups %xmm2,16(%rdx) -.byte 102,15,58,223,202,1 - call .Lkey_expansion_256a_cold -.byte 102,15,58,223,200,1 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,2 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,2 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,4 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,4 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,8 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,8 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,16 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,16 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,32 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,32 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,64 - call .Lkey_expansion_256a - movups %xmm0,(%rax) - movl %esi,16(%rax) - xorq %rax,%rax - jmp .Lenc_key_ret - -.align 16 -.L14rounds_alt: movdqa .Lkey_rotate(%rip),%xmm5 movdqa .Lkey_rcon1(%rip),%xmm4 movl $7,%r10d @@ -2248,12 +2351,12 @@ _CET_ENDBR .Ldone_key256: movl %esi,16(%rax) xorl %eax,%eax - jmp .Lenc_key_ret + jmp .Lenc_key_ret_alt .align 16 -.Lbad_keybits: +.Lbad_keybits_alt: movq $-2,%rax -.Lenc_key_ret: +.Lenc_key_ret_alt: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2264,78 +2367,8 @@ _CET_ENDBR .cfi_adjust_cfa_offset -8 ret .cfi_endproc -.LSEH_end_set_encrypt_key: - 
-.align 16 -.Lkey_expansion_128: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_128_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret - -.align 16 -.Lkey_expansion_192a: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_192a_cold: - movaps %xmm2,%xmm5 -.Lkey_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - pslldq $4,%xmm3 - xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - ret - -.align 16 -.Lkey_expansion_192b: - movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 - movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 - movups %xmm3,16(%rax) - leaq 32(%rax),%rax - jmp .Lkey_expansion_192b_warm -.align 16 -.Lkey_expansion_256a: - movups %xmm2,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret - -.align 16 -.Lkey_expansion_256b: - movups %xmm0,(%rax) - leaq 16(%rax),%rax - - shufps $16,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 - xorps %xmm1,%xmm2 - ret -.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key -.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key +.size aes_hw_set_encrypt_key_alt,.-aes_hw_set_encrypt_key_alt .section .rodata .align 64 .Lbswap_mask: @@ -2361,7 +2394,6 @@ _CET_ENDBR .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv7-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv7-linux.S index f38e85e4..7b3aa3c0 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv7-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -28,11 +27,6 @@ .align 5 aes_hw_set_encrypt_key: .Lenc_key: - mov r3,#-1 - cmp r0,#0 - beq .Lenc_key_abort - cmp r2,#0 - beq .Lenc_key_abort mov r3,#-2 cmp r1,#128 blt .Lenc_key_abort @@ -789,7 +783,6 @@ aes_hw_ctr32_encrypt_blocks: .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-apple.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-apple.S index 71e56359..8f9bf08f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -30,11 +29,6 @@ Lenc_key: AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 - mov x3,#-1 - cmp x0,#0 - b.eq Lenc_key_abort - cmp x2,#0 - b.eq Lenc_key_abort mov x3,#-2 cmp w1,#128 b.lt Lenc_key_abort @@ -791,7 +785,6 @@ Lctr32_done: #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-linux.S index 7f98e4bc..a79876a1 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -30,11 +29,6 @@ aes_hw_set_encrypt_key: AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 - mov x3,#-1 - cmp x0,#0 - b.eq .Lenc_key_abort - cmp x2,#0 - b.eq .Lenc_key_abort mov x3,#-2 cmp w1,#128 b.lt .Lenc_key_abort @@ -791,7 +785,6 @@ aes_hw_ctr32_encrypt_blocks: .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-win.S new file mode 100644 index 00000000..eda354ee --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-win.S @@ -0,0 +1,803 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.section .rodata +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key + +.def aes_hw_set_encrypt_key + .type 32 +.endef +.align 5 +aes_hw_set_encrypt_key: +Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon + add x3,x3,:lo12:Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + b.eq L192 + b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +.align 4 +L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b Ldone + +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 
// return value + ldr x29,[sp],#16 + ret + + +.globl aes_hw_set_decrypt_key + +.def aes_hw_set_decrypt_key + .type 32 +.endef +.align 5 +aes_hw_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + bl Lenc_key + + cmp x0,#0 + b.ne Ldec_key_abort + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value +Ldec_key_abort: + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_hw_encrypt + +.def aes_hw_encrypt + .type 32 +.endef +.align 5 +aes_hw_encrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl aes_hw_decrypt + +.def aes_hw_decrypt + .type 32 +.endef +.align 5 +aes_hw_decrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl aes_hw_cbc_encrypt + +.def aes_hw_cbc_encrypt + .type 32 +.endef +.align 5 +aes_hw_cbc_encrypt: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? + ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
+ sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq Lcbc_enc128 + + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b Lenter_cbc_enc + +.align 4 +Loop_cbc_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop + +Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc + + st1 {v6.16b},[x1],#16 + b Lcbc_done + +.align 5 +Lcbc_enc128: + ld1 {v2.4s,v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b Lenter_cbc_enc128 +Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc128 + + st1 {v6.16b},[x1],#16 + b Lcbc_done +.align 5 +Lcbc_dec: + ld1 {v18.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v19.16b,v18.16b,v18.16b + b.lo Lcbc_dec_tail + + orr v1.16b,v18.16b,v18.16b + ld1 {v18.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + orr v19.16b,v18.16b,v18.16b + +Loop3x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v18.16b + // are loaded 
with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v18.16b,v18.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v18.16b},[x1],#16 + orr v18.16b,v19.16b,v19.16b + b.hs Loop3x_cbc_dec + + cmn x2,#0x30 + b.eq Lcbc_done + nop + +Lcbc_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Lcbc_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + b.eq Lcbc_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b Lcbc_done + +Lcbc_dec_one: + eor v5.16b,v5.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + +Lcbc_done: + st1 {v6.16b},[x4] +Lcbc_abort: + ldr x29,[sp],#16 + ret + +.globl aes_hw_ctr32_encrypt_blocks + +.def aes_hw_ctr32_encrypt_blocks + .type 32 +.endef +.align 5 +aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. + eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr 
x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-apple.S index e6a85f0c..5202753a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1555,7 +1554,6 @@ Ldec_blocks_less_than_1: // blocks left <= 1 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-linux.S index 8ff6cd7f..a53c6074 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1555,7 +1554,6 @@ aes_gcm_dec_kernel: .size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-win.S new file mode 100644 index 00000000..cad80f35 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-win.S @@ -0,0 +1,1564 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include +#if __ARM_MAX_ARCH__ >= 8 + +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel + +.def aes_gcm_enc_kernel + .type 32 +.endef +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr q25, [x8, #112] // load rk7 + add x5, x5, x0 + lsr x12, x11, #32 + fmov d2, x10 // CTR block 2 + orr w11, w11, w11 + rev w12, w12 // rev_ctr32 + fmov d1, x10 // CTR block 1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + add w12, w12, #1 // increment rev_ctr32 + rev w9, w12 // CTR block 1 + fmov d3, x10 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 1 + add w12, w12, #1 // CTR block 1 + ldr q19, [x8, #16] // load rk1 + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + ldr q20, [x8, #32] // load rk2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + orr x9, x11, x9, lsl #32 // CTR block 3 + fmov v3.d[1], x9 // CTR block 3 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q21, [x8, #48] // load rk3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q24, [x8, #96] // load rk6 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q23, [x8, #80] // load rk5 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q22, [x8, #64] // load rk4 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ldr q29, [x8, #176] // load rk11 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ldr q26, [x8, #128] // load rk8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + add w12, w12, #1 // CTR block 3 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + cmp x17, #12 // 
setup flags for AES-128/192/256 check + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + ldr q27, [x8, #144] // load rk9 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + ldr q28, [x8, #160] // load rk10 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + b.lt Lenc_finish_first_blocks // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v2.16b, v31.16b // AES block 2 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Lenc_tail // handle tail + + ldp x19, x20, [x0, #16] // AES block 1 - load plaintext + rev w9, w12 // CTR block 4 + ldp x6, x7, [x0, #0] // AES block 0 - load plaintext + ldp x23, x24, [x0, #48] // AES block 3 - load plaintext + ldp x21, x22, [x0, #32] // AES block 2 - load plaintext + add x0, x0, #64 // AES input_ptr update + eor x19, x19, x13 // AES block 1 - round N low + eor x20, x20, x14 // AES block 1 - round N high + fmov d5, x19 // 
AES block 1 - mov low + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + eor x24, x24, x14 // AES block 3 - round N high + fmov d4, x6 // AES block 0 - mov low + cmp x0, x5 // check if we have <= 8 blocks + fmov v4.d[1], x7 // AES block 0 - mov high + eor x23, x23, x13 // AES block 3 - round N low + eor x21, x21, x13 // AES block 2 - round N low + fmov v5.d[1], x20 // AES block 1 - mov high + fmov d6, x21 // AES block 2 - mov low + add w12, w12, #1 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov d7, x23 // AES block 3 - mov low + eor x22, x22, x14 // AES block 2 - round N high + fmov v6.d[1], x22 // AES block 2 - mov high + eor v4.16b, v4.16b, v0.16b // AES block 0 - result + fmov d0, x10 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + eor v5.16b, v5.16b, v1.16b // AES block 1 - result + fmov d1, x10 // CTR block 5 + orr x9, x11, x9, lsl #32 // CTR block 5 + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + st1 { v4.16b}, [x2], #16 // AES block 0 - store result + fmov v7.d[1], x24 // AES block 3 - mov high + orr x9, x11, x9, lsl #32 // CTR block 6 + eor v6.16b, v6.16b, v2.16b // AES block 2 - result + st1 { v5.16b}, [x2], #16 // AES block 1 - store result + add w12, w12, #1 // CTR block 6 + fmov d2, x10 // CTR block 6 + fmov v2.d[1], x9 // CTR block 6 + st1 { v6.16b}, [x2], #16 // AES block 2 - store result + rev w9, w12 // CTR block 7 + orr x9, x11, x9, lsl #32 // CTR block 7 + eor v7.16b, v7.16b, v3.16b // AES block 3 - result + st1 { v7.16b}, [x2], #16 // AES block 3 - store result + b.ge Lenc_prepretail // do prepretail + +Lenc_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + fmov v3.d[1], x9 // CTR block 4k+3 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x23, x23, x13 // AES block 4k+7 - round N low + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + eor x22, x22, x14 // AES block 4k+6 - round N high + mov d8, v4.d[1] // GHASH block 4k - mid + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b // GHASH block 4k+3 
(t0, t1, t2 and t3 free) + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor x19, x19, x13 // AES block 4k+5 - round N low + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + eor x21, x21, x13 // AES block 4k+6 - round N low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + movi v8.8b, #0xc2 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + fmov d5, x19 // AES block 4k+5 - mov low + ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext + b.lt Lenc_main_loop_continue // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + 
aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Lenc_main_loop_continue: + shl d8, d8, #56 // mod_constant + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + add w12, w12, #1 // CTR block 4k+3 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + add x0, x0, #64 // AES input_ptr update + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + rev w9, w12 // CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor x6, x6, x13 // AES block 4k+4 - round N low + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + eor x7, x7, x14 // AES block 4k+4 - round N high + fmov d4, x6 // AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid + eor x20, x20, x14 // AES block 4k+5 - round N high + eor x24, x24, x14 // AES block 4k+7 - round N high + add w12, w12, #1 // CTR block 4k+8 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + fmov d7, x23 // AES block 4k+7 - mov low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + fmov v5.d[1], x20 // AES block 4k+5 - mov high + fmov d6, x21 // AES block 4k+6 - mov low + cmp x0, x5 // LOOP CONTROL + fmov v6.d[1], x22 // AES block 4k+6 - mov high + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + rev w9, w12 // CTR block 4k+9 + add w12, w12, #1 // CTR block 4k+9 + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + fmov d1, x10 // CTR block 4k+9 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + fmov v1.d[1], x9 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + rev w9, w12 // CTR block 4k+10 + st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + fmov v7.d[1], x24 // AES block 4k+7 - mov high + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + fmov d2, x10 // CTR block 4k+10 + st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result + fmov 
v2.d[1], x9 // CTR block 4k+10 + rev w9, w12 // CTR block 4k+11 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + orr x9, x11, x9, lsl #32 // CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result + b.lt Lenc_main_loop + +Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + fmov v3.d[1], x9 // CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + add w12, w12, #1 // CTR block 4k+3 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 
4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + b.eq Lenc_finish_prepretail // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + +Lenc_finish_prepretail: + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v11.16b, v11.16b, v4.16b + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b + +Lenc_tail: // TAIL + ext 
v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 4k+4 - mov low + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result + b.gt Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + sub w12, w12, #1 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + b.gt Lenc_blocks_more_than_1 + sub w12, w12, #1 + b Lenc_blocks_less_than_1 +Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-3 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ldp x6, x7, [x0], #16 // AES final block - load input low & high + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final 
block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored + mvn x14, xzr // rkN_h = 0xffffffffffffffff + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0b is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + rev w9, w12 + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + str w9, [x16, #12] // store the updated counter + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_gcm_dec_kernel + +.def aes_gcm_dec_kernel + .type 32 +.endef +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 
2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low 
+ rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc 
v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, 
v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 
4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, 
v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese 
v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins 
v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/armv4-mont-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/armv4-mont-linux.S index bbec47b3..d321b10f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-linux.linux.arm.S +++ 
b/Sources/CCryptoBoringSSL/gen/bcm/armv4-mont-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -939,7 +938,6 @@ bn_mul8x_mont_neon: .byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-apple.S index 82b43294..594fdbde 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1425,7 +1424,6 @@ Lmul4x_done: .align 2 .align 4 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-linux.S index 5d56f5b2..03137764 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1425,7 +1424,6 @@ __bn_mul4x_mont: .align 2 .align 4 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-win.S b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-win.S new file mode 100644 index 00000000..f12bf106 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-win.S @@ -0,0 +1,1436 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.text + +.globl bn_mul_mont + +.def bn_mul_mont + .type 32 +.endef +.align 5 +bn_mul_mont: + AARCH64_SIGN_LINK_REGISTER + tst x5,#7 + b.eq __bn_sqr8x_mont + tst x5,#3 + b.eq __bn_mul4x_mont +Lmul_mont: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def __bn_sqr8x_mont + .type 32 +.endef +.align 5 +__bn_sqr8x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont which has already signed the return address. + cmp x1,x2 + b.ne __bn_mul4x_mont +Lsqr8x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp 
x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ 
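
The per-word schedules above implement the usual squaring shortcut: this pass accumulates only the cross products a[i]*a[j] with i < j; the doubling and the diagonal terms a[i]^2 are folded in later at Lsqr8x_outer_break. As a side note (not from the diff), for limbs a_0..a_{n-1} in base 2^64:

  \Bigl(\sum_{i=0}^{n-1} a_i 2^{64i}\Bigr)^2 = \sum_{i=0}^{n-1} a_i^2\,2^{128i} + 2\sum_{0 \le i < j < n} a_i a_j\,2^{64(i+j)}

so roughly half of the single-word multiplications of a general product can be skipped when squaring.
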
+Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? + ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + 
ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? + adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? 
+ sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? 
+ ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def __bn_mul4x_mont + .type 32 +.endef +.align 5 +__bn_mul4x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont or __bn_mul8x_mont which have already signed the + // return address. + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/bcm/bn-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-586-apple.S new file mode 100644 index 00000000..41d93acb --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-586-apple.S @@ -0,0 +1,536 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_add_words +.private_extern _bn_mul_add_words +.align 4 +_bn_mul_add_words: +L_bn_mul_add_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 + jmp L000maw_sse2_entry +.align 4,0x90 +L001maw_sse2_unrolled: + movd (%eax),%mm3 + paddq %mm3,%mm1 + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + movd 4(%edx),%mm4 + pmuludq %mm0,%mm4 + movd 8(%edx),%mm6 + pmuludq %mm0,%mm6 + movd 12(%edx),%mm7 + pmuludq %mm0,%mm7 + paddq %mm2,%mm1 + movd 4(%eax),%mm3 + paddq %mm4,%mm3 + movd 8(%eax),%mm5 + paddq %mm6,%mm5 + movd 12(%eax),%mm4 + paddq %mm4,%mm7 + movd %mm1,(%eax) + movd 16(%edx),%mm2 + pmuludq %mm0,%mm2 + psrlq $32,%mm1 + movd 20(%edx),%mm4 + pmuludq %mm0,%mm4 + paddq %mm3,%mm1 + movd 24(%edx),%mm6 + pmuludq %mm0,%mm6 + movd %mm1,4(%eax) + psrlq $32,%mm1 + movd 28(%edx),%mm3 + addl $32,%edx + pmuludq %mm0,%mm3 + paddq %mm5,%mm1 + movd 16(%eax),%mm5 + paddq %mm5,%mm2 + movd %mm1,8(%eax) + psrlq $32,%mm1 + paddq %mm7,%mm1 + movd 20(%eax),%mm5 + paddq %mm5,%mm4 + movd %mm1,12(%eax) + psrlq $32,%mm1 + paddq %mm2,%mm1 + movd 24(%eax),%mm5 + paddq %mm5,%mm6 + movd %mm1,16(%eax) + psrlq $32,%mm1 + paddq %mm4,%mm1 + movd 28(%eax),%mm5 + paddq %mm5,%mm3 + movd %mm1,20(%eax) + psrlq $32,%mm1 + paddq %mm6,%mm1 + movd %mm1,24(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,28(%eax) + leal 32(%eax),%eax + psrlq $32,%mm1 + subl $8,%ecx + jz L002maw_sse2_exit +L000maw_sse2_entry: + testl $4294967288,%ecx + jnz L001maw_sse2_unrolled +.align 2,0x90 +L003maw_sse2_loop: + movd (%edx),%mm2 + movd (%eax),%mm3 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm3,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz L003maw_sse2_loop +L002maw_sse2_exit: + movd %mm1,%eax + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_mul_words +.private_extern _bn_mul_words +.align 4 +_bn_mul_words: +L_bn_mul_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 +.align 4,0x90 +L004mw_sse2_loop: + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz L004mw_sse2_loop + movd %mm1,%eax + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_sqr_words +.private_extern _bn_sqr_words +.align 4 +_bn_sqr_words: +L_bn_sqr_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx +.align 4,0x90 +L005sqr_sse2_loop: + movd (%edx),%mm0 + pmuludq %mm0,%mm0 + leal 4(%edx),%edx + movq %mm0,(%eax) + subl $1,%ecx + leal 8(%eax),%eax + jnz L005sqr_sse2_loop + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_div_words +.private_extern _bn_div_words +.align 4 +_bn_div_words: +L_bn_div_words_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + movl 12(%esp),%ecx + divl %ecx + ret +.globl _bn_add_words +.private_extern _bn_add_words +.align 4 +_bn_add_words: +L_bn_add_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp),%ebx + movl 24(%esp),%esi + movl 28(%esp),%edi + movl 32(%esp),%ebp + xorl %eax,%eax + andl $4294967288,%ebp + jz L006aw_finish +L007aw_loop: + # Round 0 + movl (%esi),%ecx + movl (%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,(%ebx) + # Round 1 + movl 4(%esi),%ecx + movl 
4(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,4(%ebx) + # Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,8(%ebx) + # Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,12(%ebx) + # Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,16(%ebx) + # Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,20(%ebx) + # Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) + # Round 7 + movl 28(%esi),%ecx + movl 28(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,28(%ebx) + + addl $32,%esi + addl $32,%edi + addl $32,%ebx + subl $8,%ebp + jnz L007aw_loop +L006aw_finish: + movl 32(%esp),%ebp + andl $7,%ebp + jz L008aw_end + # Tail Round 0 + movl (%esi),%ecx + movl (%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,(%ebx) + jz L008aw_end + # Tail Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,4(%ebx) + jz L008aw_end + # Tail Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,8(%ebx) + jz L008aw_end + # Tail Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,12(%ebx) + jz L008aw_end + # Tail Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,16(%ebx) + jz L008aw_end + # Tail Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,20(%ebx) + jz L008aw_end + # Tail Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) +L008aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_sub_words +.private_extern _bn_sub_words +.align 4 +_bn_sub_words: +L_bn_sub_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp),%ebx + movl 24(%esp),%esi + movl 28(%esp),%edi + movl 32(%esp),%ebp + xorl %eax,%eax + andl $4294967288,%ebp + jz L009aw_finish +L010aw_loop: + # Round 0 + movl (%esi),%ecx + movl (%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,(%ebx) + # Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,4(%ebx) + # Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,8(%ebx) + # Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,12(%ebx) + # Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + subl %eax,%ecx + movl 
$0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,16(%ebx) + # Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,20(%ebx) + # Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) + # Round 7 + movl 28(%esi),%ecx + movl 28(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,28(%ebx) + + addl $32,%esi + addl $32,%edi + addl $32,%ebx + subl $8,%ebp + jnz L010aw_loop +L009aw_finish: + movl 32(%esp),%ebp + andl $7,%ebp + jz L011aw_end + # Tail Round 0 + movl (%esi),%ecx + movl (%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,(%ebx) + jz L011aw_end + # Tail Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,4(%ebx) + jz L011aw_end + # Tail Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,8(%ebx) + jz L011aw_end + # Tail Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,12(%ebx) + jz L011aw_end + # Tail Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,16(%ebx) + jz L011aw_end + # Tail Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,20(%ebx) + jz L011aw_end + # Tail Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) +L011aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-586-linux.S similarity index 53% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/bn-586-linux.S index ff3c9b61..43edba09 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -13,20 +12,14 @@ .align 16 bn_mul_add_words: .L_bn_mul_add_words_begin: - call .L000PIC_me_up -.L000PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L001maw_non_sse2 movl 4(%esp),%eax movl 8(%esp),%edx movl 12(%esp),%ecx movd 16(%esp),%mm0 pxor %mm1,%mm1 - jmp .L002maw_sse2_entry + jmp .L000maw_sse2_entry .align 16 -.L003maw_sse2_unrolled: +.L001maw_sse2_unrolled: movd (%eax),%mm3 paddq %mm3,%mm1 movd (%edx),%mm2 @@ -86,12 +79,12 @@ bn_mul_add_words: leal 32(%eax),%eax psrlq $32,%mm1 subl $8,%ecx - jz .L004maw_sse2_exit -.L002maw_sse2_entry: + jz .L002maw_sse2_exit +.L000maw_sse2_entry: testl $4294967288,%ecx - jnz .L003maw_sse2_unrolled + jnz .L001maw_sse2_unrolled .align 4 -.L005maw_sse2_loop: +.L003maw_sse2_loop: movd (%edx),%mm2 movd (%eax),%mm3 pmuludq %mm0,%mm2 @@ -102,189 +95,11 @@ bn_mul_add_words: subl $1,%ecx psrlq $32,%mm1 leal 4(%eax),%eax - jnz .L005maw_sse2_loop -.L004maw_sse2_exit: + jnz .L003maw_sse2_loop +.L002maw_sse2_exit: movd %mm1,%eax emms ret -.align 16 -.L001maw_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - xorl %esi,%esi - movl 20(%esp),%edi - movl 28(%esp),%ecx - movl 24(%esp),%ebx - andl $4294967288,%ecx - movl 32(%esp),%ebp - pushl %ecx - jz .L006maw_finish -.align 16 -.L007maw_loop: - - movl (%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl (%edi),%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - - movl 4(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 4(%edi),%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - - movl 8(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 8(%edi),%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - - movl 12(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 12(%edi),%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - - movl 16(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 16(%edi),%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - - movl 20(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 20(%edi),%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - - movl 24(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 24(%edi),%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi - - movl 28(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 28(%edi),%eax - adcl $0,%edx - movl %eax,28(%edi) - movl %edx,%esi - - subl $8,%ecx - leal 32(%ebx),%ebx - leal 32(%edi),%edi - jnz .L007maw_loop -.L006maw_finish: - movl 32(%esp),%ecx - andl $7,%ecx - jnz .L008maw_finish2 - jmp .L009maw_end -.L008maw_finish2: - - movl (%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl (%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 4(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 4(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,4(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 8(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 8(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,8(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 12(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 12(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,12(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 16(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 16(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,16(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 20(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 20(%edi),%eax - adcl 
$0,%edx - decl %ecx - movl %eax,20(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 24(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 24(%edi),%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi -.L009maw_end: - movl %esi,%eax - popl %ecx popl %edi popl %esi popl %ebx @@ -297,19 +112,13 @@ bn_mul_add_words: .align 16 bn_mul_words: .L_bn_mul_words_begin: - call .L010PIC_me_up -.L010PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L011mw_non_sse2 movl 4(%esp),%eax movl 8(%esp),%edx movl 12(%esp),%ecx movd 16(%esp),%mm0 pxor %mm1,%mm1 .align 16 -.L012mw_sse2_loop: +.L004mw_sse2_loop: movd (%edx),%mm2 pmuludq %mm0,%mm2 leal 4(%edx),%edx @@ -318,156 +127,10 @@ bn_mul_words: subl $1,%ecx psrlq $32,%mm1 leal 4(%eax),%eax - jnz .L012mw_sse2_loop + jnz .L004mw_sse2_loop movd %mm1,%eax emms ret -.align 16 -.L011mw_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - xorl %esi,%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx - movl 28(%esp),%ebp - movl 32(%esp),%ecx - andl $4294967288,%ebp - jz .L013mw_finish -.L014mw_loop: - - movl (%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - - movl 4(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - - movl 8(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - - movl 12(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - - movl 16(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - - movl 20(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - - movl 24(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi - - movl 28(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,28(%edi) - movl %edx,%esi - - addl $32,%ebx - addl $32,%edi - subl $8,%ebp - jz .L013mw_finish - jmp .L014mw_loop -.L013mw_finish: - movl 28(%esp),%ebp - andl $7,%ebp - jnz .L015mw_finish2 - jmp .L016mw_end -.L015mw_finish2: - - movl (%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 4(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 8(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 12(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 16(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 20(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 24(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi -.L016mw_end: - movl %esi,%eax popl %edi popl %esi popl %ebx @@ -480,136 +143,20 @@ bn_mul_words: .align 16 bn_sqr_words: .L_bn_sqr_words_begin: - call .L017PIC_me_up -.L017PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L018sqr_non_sse2 movl 4(%esp),%eax movl 8(%esp),%edx movl 12(%esp),%ecx .align 16 -.L019sqr_sse2_loop: +.L005sqr_sse2_loop: movd (%edx),%mm0 pmuludq %mm0,%mm0 leal 4(%edx),%edx movq %mm0,(%eax) subl $1,%ecx leal 8(%eax),%eax - jnz 
.L019sqr_sse2_loop + jnz .L005sqr_sse2_loop emms ret -.align 16 -.L018sqr_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%ebx - andl $4294967288,%ebx - jz .L020sw_finish -.L021sw_loop: - - movl (%edi),%eax - mull %eax - movl %eax,(%esi) - movl %edx,4(%esi) - - movl 4(%edi),%eax - mull %eax - movl %eax,8(%esi) - movl %edx,12(%esi) - - movl 8(%edi),%eax - mull %eax - movl %eax,16(%esi) - movl %edx,20(%esi) - - movl 12(%edi),%eax - mull %eax - movl %eax,24(%esi) - movl %edx,28(%esi) - - movl 16(%edi),%eax - mull %eax - movl %eax,32(%esi) - movl %edx,36(%esi) - - movl 20(%edi),%eax - mull %eax - movl %eax,40(%esi) - movl %edx,44(%esi) - - movl 24(%edi),%eax - mull %eax - movl %eax,48(%esi) - movl %edx,52(%esi) - - movl 28(%edi),%eax - mull %eax - movl %eax,56(%esi) - movl %edx,60(%esi) - - addl $32,%edi - addl $64,%esi - subl $8,%ebx - jnz .L021sw_loop -.L020sw_finish: - movl 28(%esp),%ebx - andl $7,%ebx - jz .L022sw_end - - movl (%edi),%eax - mull %eax - movl %eax,(%esi) - decl %ebx - movl %edx,4(%esi) - jz .L022sw_end - - movl 4(%edi),%eax - mull %eax - movl %eax,8(%esi) - decl %ebx - movl %edx,12(%esi) - jz .L022sw_end - - movl 8(%edi),%eax - mull %eax - movl %eax,16(%esi) - decl %ebx - movl %edx,20(%esi) - jz .L022sw_end - - movl 12(%edi),%eax - mull %eax - movl %eax,24(%esi) - decl %ebx - movl %edx,28(%esi) - jz .L022sw_end - - movl 16(%edi),%eax - mull %eax - movl %eax,32(%esi) - decl %ebx - movl %edx,36(%esi) - jz .L022sw_end - - movl 20(%edi),%eax - mull %eax - movl %eax,40(%esi) - decl %ebx - movl %edx,44(%esi) - jz .L022sw_end - - movl 24(%edi),%eax - mull %eax - movl %eax,48(%esi) - movl %edx,52(%esi) -.L022sw_end: popl %edi popl %esi popl %ebx @@ -645,8 +192,8 @@ bn_add_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L023aw_finish -.L024aw_loop: + jz .L006aw_finish +.L007aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -724,11 +271,11 @@ bn_add_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L024aw_loop -.L023aw_finish: + jnz .L007aw_loop +.L006aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L025aw_end + jz .L008aw_end movl (%esi),%ecx movl (%edi),%edx @@ -739,7 +286,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L025aw_end + jz .L008aw_end movl 4(%esi),%ecx movl 4(%edi),%edx @@ -750,7 +297,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L025aw_end + jz .L008aw_end movl 8(%esi),%ecx movl 8(%edi),%edx @@ -761,7 +308,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L025aw_end + jz .L008aw_end movl 12(%esi),%ecx movl 12(%edi),%edx @@ -772,7 +319,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L025aw_end + jz .L008aw_end movl 16(%esi),%ecx movl 16(%edi),%edx @@ -783,7 +330,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L025aw_end + jz .L008aw_end movl 20(%esi),%ecx movl 20(%edi),%edx @@ -794,7 +341,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L025aw_end + jz .L008aw_end movl 24(%esi),%ecx movl 24(%edi),%edx @@ -804,7 +351,7 @@ bn_add_words: addl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -.L025aw_end: +.L008aw_end: popl %edi popl %esi popl %ebx @@ -828,8 +375,8 @@ bn_sub_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L026aw_finish -.L027aw_loop: + jz .L009aw_finish +.L010aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -907,11 +454,11 @@ bn_sub_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L027aw_loop -.L026aw_finish: + jnz .L010aw_loop 
+.L009aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L028aw_end + jz .L011aw_end movl (%esi),%ecx movl (%edi),%edx @@ -922,7 +469,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L028aw_end + jz .L011aw_end movl 4(%esi),%ecx movl 4(%edi),%edx @@ -933,7 +480,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L028aw_end + jz .L011aw_end movl 8(%esi),%ecx movl 8(%edi),%edx @@ -944,7 +491,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L028aw_end + jz .L011aw_end movl 12(%esi),%ecx movl 12(%edi),%edx @@ -955,7 +502,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L028aw_end + jz .L011aw_end movl 16(%esi),%ecx movl 16(%edi),%edx @@ -966,7 +513,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L028aw_end + jz .L011aw_end movl 20(%esi),%ecx movl 20(%edi),%edx @@ -977,7 +524,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L028aw_end + jz .L011aw_end movl 24(%esi),%ecx movl 24(%edi),%edx @@ -987,7 +534,7 @@ bn_sub_words: subl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -.L028aw_end: +.L011aw_end: popl %edi popl %esi popl %ebx @@ -995,7 +542,6 @@ bn_sub_words: ret .size bn_sub_words,.-.L_bn_sub_words_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-apple.S similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-apple.S index 4efdfa89..66e8c837 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -89,7 +88,6 @@ Lsub_exit: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-linux.S similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-linux.S index 14d9eeee..9ea9ad7d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -89,7 +88,6 @@ bn_sub_words: ret .size bn_sub_words,.-bn_sub_words #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-win.S new file mode 100644 index 00000000..6c7a6a94 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-win.S @@ -0,0 +1,94 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.text + +// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); + +.globl bn_add_words + +.align 4 +bn_add_words: + AARCH64_VALID_CALL_TARGET + # Clear the carry flag. + cmn xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. + lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, Ladd_tail +Ladd_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + adcs x4, x4, x6 + adcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, Ladd_loop + +Ladd_tail: + cbz x3, Ladd_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + adcs x4, x4, x6 + str x4, [x0], #8 + +Ladd_exit: + cset x0, cs + ret + + +// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); + +.globl bn_sub_words + +.align 4 +bn_sub_words: + AARCH64_VALID_CALL_TARGET + # Set the carry flag. Arm's borrow bit is flipped from the carry flag, + # so we want C = 1 here. + cmp xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. + lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, Lsub_tail +Lsub_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + sbcs x4, x4, x6 + sbcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, Lsub_loop + +Lsub_tail: + cbz x3, Lsub_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + sbcs x4, x4, x6 + str x4, [x0], #8 + +Lsub_exit: + cset x0, cc + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/bsaes-armv7-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/bsaes-armv7-linux.S index 5f512cc3..5d85cdfb 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/bsaes-armv7-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
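
The bn_add_words and bn_sub_words routines added above in gen/bcm/bn-armv8-win.S keep a single carry (or borrow) flag live across the whole limb array: the limb count is split into pairs so LDP/STP can be used, and the loop ends with CBNZ so the flags survive from one iteration to the next. A portable C sketch of the same word-array contract (illustration only; the function names, the BN_ULONG typedef, and 64-bit little-endian limb order are assumptions here, not code from the diff):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t BN_ULONG;  /* assumed 64-bit limbs, least-significant limb first */

/* rp[i] = ap[i] + bp[i] + carry-in; returns the final carry (0 or 1). */
BN_ULONG add_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                          const BN_ULONG *bp, size_t num) {
    BN_ULONG carry = 0;
    for (size_t i = 0; i < num; i++) {
        BN_ULONG t = ap[i] + carry;
        BN_ULONG c1 = (t < carry);     /* wrapped while adding the incoming carry */
        rp[i] = t + bp[i];
        carry = c1 + (rp[i] < t);      /* wrapped while adding bp[i] */
    }
    return carry;
}

/* rp[i] = ap[i] - bp[i] - borrow-in; returns the final borrow (0 or 1). */
BN_ULONG sub_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                          const BN_ULONG *bp, size_t num) {
    BN_ULONG borrow = 0;
    for (size_t i = 0; i < num; i++) {
        BN_ULONG t = ap[i] - borrow;
        BN_ULONG b1 = (t > ap[i]);     /* wrapped while subtracting the incoming borrow */
        rp[i] = t - bp[i];
        borrow = b1 + (rp[i] > t);     /* wrapped while subtracting bp[i] */
    }
    return borrow;
}

int main(void) {
    BN_ULONG a[2] = {UINT64_MAX, UINT64_MAX}, b[2] = {1, 0}, r[2];
    /* (2^128 - 1) + 1 = 2^128: both result limbs are 0 and the carry out is 1. */
    BN_ULONG carry = add_words_sketch(r, a, b, 2);
    printf("r = {%llu, %llu}, carry = %llu\n",
           (unsigned long long)r[0], (unsigned long long)r[1],
           (unsigned long long)carry);
    return 0;
}
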
@@ -1517,7 +1516,6 @@ bsaes_ctr32_encrypt_blocks: .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/co-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/co-586-apple.S new file mode 100644 index 00000000..935cc5ed --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/co-586-apple.S @@ -0,0 +1,1261 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_comba8 +.private_extern _bn_mul_comba8 +.align 4 +_bn_mul_comba8: +L_bn_mul_comba8_begin: + pushl %esi + movl 12(%esp),%esi + pushl %edi + movl 20(%esp),%edi + pushl %ebp + pushl %ebx + xorl %ebx,%ebx + movl (%esi),%eax + xorl %ecx,%ecx + movl (%edi),%edx + # ################## Calculate word 0 + xorl %ebp,%ebp + # mul a[0]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,(%eax) + movl 4(%esi),%eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx,%ebx + # mul a[1]*b[0] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,4(%eax) + movl 8(%esi),%eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx,%ecx + # mul a[2]*b[0] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[1] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,8(%eax) + movl 12(%esi),%eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp,%ebp + # mul a[3]*b[0] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[1] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[2] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,12(%eax) + movl 16(%esi),%eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx,%ebx + # mul a[4]*b[0] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[1] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[2] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[3] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[4] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,16(%eax) + movl 20(%esi),%eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx,%ecx + # mul a[5]*b[0] + mull %edx + addl %eax,%ebp + 
movl 16(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[4]*b[1] + mull %edx + addl %eax,%ebp + movl 12(%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[3]*b[2] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[3] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 16(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[4] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[5] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,20(%eax) + movl 24(%esi),%eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp,%ebp + # mul a[6]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[1] + mull %edx + addl %eax,%ebx + movl 16(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[4]*b[2] + mull %edx + addl %eax,%ebx + movl 12(%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[3]*b[3] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 16(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[4] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 20(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[5] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[6] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,24(%eax) + movl 28(%esi),%eax + # saved r[6] + # ################## Calculate word 7 + xorl %ebx,%ebx + # mul a[7]*b[0] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[5]*b[2] + mull %edx + addl %eax,%ecx + movl 16(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[4]*b[3] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[4] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 20(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[5] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 24(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[6] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + movl %ecx,28(%eax) + movl 28(%esi),%eax + # saved r[7] + # ################## Calculate word 8 + xorl %ecx,%ecx + # mul a[7]*b[1] + mull %edx + addl %eax,%ebp + movl 24(%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[6]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[5]*b[3] + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 16(%edi),%edx + adcl $0,%ecx + # mul a[4]*b[4] + mull %edx + addl %eax,%ebp + movl 12(%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[3]*b[5] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 24(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[6] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 28(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[7] + 
mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + movl %ebp,32(%eax) + movl 28(%esi),%eax + # saved r[8] + # ################## Calculate word 9 + xorl %ebp,%ebp + # mul a[7]*b[2] + mull %edx + addl %eax,%ebx + movl 24(%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[6]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 16(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[4] + mull %edx + addl %eax,%ebx + movl 16(%esi),%eax + adcl %edx,%ecx + movl 20(%edi),%edx + adcl $0,%ebp + # mul a[4]*b[5] + mull %edx + addl %eax,%ebx + movl 12(%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[3]*b[6] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 28(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[7] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + movl %ebx,36(%eax) + movl 28(%esi),%eax + # saved r[9] + # ################## Calculate word 10 + xorl %ebx,%ebx + # mul a[7]*b[3] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[4] + mull %edx + addl %eax,%ecx + movl 20(%esi),%eax + adcl %edx,%ebp + movl 20(%edi),%edx + adcl $0,%ebx + # mul a[5]*b[5] + mull %edx + addl %eax,%ecx + movl 16(%esi),%eax + adcl %edx,%ebp + movl 24(%edi),%edx + adcl $0,%ebx + # mul a[4]*b[6] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + movl %ecx,40(%eax) + movl 28(%esi),%eax + # saved r[10] + # ################## Calculate word 11 + xorl %ecx,%ecx + # mul a[7]*b[4] + mull %edx + addl %eax,%ebp + movl 24(%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[6]*b[5] + mull %edx + addl %eax,%ebp + movl 20(%esi),%eax + adcl %edx,%ebx + movl 24(%edi),%edx + adcl $0,%ecx + # mul a[5]*b[6] + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 28(%edi),%edx + adcl $0,%ecx + # mul a[4]*b[7] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + movl %ebp,44(%eax) + movl 28(%esi),%eax + # saved r[11] + # ################## Calculate word 12 + xorl %ebp,%ebp + # mul a[7]*b[5] + mull %edx + addl %eax,%ebx + movl 24(%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[6]*b[6] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 28(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[7] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + movl %ebx,48(%eax) + movl 28(%esi),%eax + # saved r[12] + # ################## Calculate word 13 + xorl %ebx,%ebx + # mul a[7]*b[6] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + movl %ecx,52(%eax) + movl 28(%esi),%eax + # saved r[13] + # ################## Calculate word 14 + xorl %ecx,%ecx + # mul a[7]*b[7] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + adcl $0,%ecx + movl %ebp,56(%eax) + # saved r[14] + # save r[15] + movl %ebx,60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_mul_comba4 +.private_extern _bn_mul_comba4 +.align 4 +_bn_mul_comba4: +L_bn_mul_comba4_begin: + pushl 
%esi + movl 12(%esp),%esi + pushl %edi + movl 20(%esp),%edi + pushl %ebp + pushl %ebx + xorl %ebx,%ebx + movl (%esi),%eax + xorl %ecx,%ecx + movl (%edi),%edx + # ################## Calculate word 0 + xorl %ebp,%ebp + # mul a[0]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,(%eax) + movl 4(%esi),%eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx,%ebx + # mul a[1]*b[0] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,4(%eax) + movl 8(%esi),%eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx,%ecx + # mul a[2]*b[0] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[1] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,8(%eax) + movl 12(%esi),%eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp,%ebp + # mul a[3]*b[0] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[1] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[2] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + movl %ebx,12(%eax) + movl 12(%esi),%eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx,%ebx + # mul a[3]*b[1] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[2] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[3] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + movl %ecx,16(%eax) + movl 12(%esi),%eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx,%ecx + # mul a[3]*b[2] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[3] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + movl %ebp,20(%eax) + movl 12(%esi),%eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp,%ebp + # mul a[3]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + adcl $0,%ebp + movl %ebx,24(%eax) + # saved r[6] + # save r[7] + movl %ecx,28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_sqr_comba8 +.private_extern _bn_sqr_comba8 +.align 4 +_bn_sqr_comba8: +L_bn_sqr_comba8_begin: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp),%edi + movl 24(%esp),%esi + xorl %ebx,%ebx + xorl %ecx,%ecx + movl (%esi),%eax + # ############### Calculate word 0 + xorl %ebp,%ebp + # sqr a[0]*a[0] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,(%edi) + movl 4(%esi),%eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx,%ebx + # sqr a[1]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + 
adcl $0,%ebx + movl %ecx,4(%edi) + movl (%esi),%edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx,%ecx + # sqr a[2]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 4(%esi),%eax + adcl $0,%ecx + # sqr a[1]*a[1] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl (%esi),%edx + adcl $0,%ecx + movl %ebp,8(%edi) + movl 12(%esi),%eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp,%ebp + # sqr a[3]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 8(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[2]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 16(%esi),%eax + adcl $0,%ebp + movl %ebx,12(%edi) + movl (%esi),%edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx,%ebx + # sqr a[4]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 12(%esi),%eax + adcl $0,%ebx + movl 4(%esi),%edx + # sqr a[3]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + # sqr a[2]*a[2] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl (%esi),%edx + adcl $0,%ebx + movl %ecx,16(%edi) + movl 20(%esi),%eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx,%ecx + # sqr a[5]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 16(%esi),%eax + adcl $0,%ecx + movl 4(%esi),%edx + # sqr a[4]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 12(%esi),%eax + adcl $0,%ecx + movl 8(%esi),%edx + # sqr a[3]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl %ebp,20(%edi) + movl (%esi),%edx + # saved r[5] + # ############### Calculate word 6 + xorl %ebp,%ebp + # sqr a[6]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 20(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[5]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 16(%esi),%eax + adcl $0,%ebp + movl 8(%esi),%edx + # sqr a[4]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 12(%esi),%eax + adcl $0,%ebp + # sqr a[3]*a[3] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,24(%edi) + movl 28(%esi),%eax + # saved r[6] + # ############### Calculate word 7 + xorl %ebx,%ebx + # sqr a[7]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 24(%esi),%eax + adcl $0,%ebx + movl 4(%esi),%edx + # sqr a[6]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 20(%esi),%eax + adcl $0,%ebx + movl 8(%esi),%edx + # sqr a[5]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 16(%esi),%eax + adcl $0,%ebx + movl 12(%esi),%edx + # sqr a[4]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 28(%esi),%eax + adcl $0,%ebx + movl %ecx,28(%edi) + movl 4(%esi),%edx + # saved r[7] + # ############### Calculate word 8 + xorl %ecx,%ecx + # sqr a[7]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + 
adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl 8(%esi),%edx + # sqr a[6]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 20(%esi),%eax + adcl $0,%ecx + movl 12(%esi),%edx + # sqr a[5]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 16(%esi),%eax + adcl $0,%ecx + # sqr a[4]*a[4] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl 8(%esi),%edx + adcl $0,%ecx + movl %ebp,32(%edi) + movl 28(%esi),%eax + # saved r[8] + # ############### Calculate word 9 + xorl %ebp,%ebp + # sqr a[7]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%eax + adcl $0,%ebp + movl 12(%esi),%edx + # sqr a[6]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 20(%esi),%eax + adcl $0,%ebp + movl 16(%esi),%edx + # sqr a[5]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 28(%esi),%eax + adcl $0,%ebp + movl %ebx,36(%edi) + movl 12(%esi),%edx + # saved r[9] + # ############### Calculate word 10 + xorl %ebx,%ebx + # sqr a[7]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 24(%esi),%eax + adcl $0,%ebx + movl 16(%esi),%edx + # sqr a[6]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 20(%esi),%eax + adcl $0,%ebx + # sqr a[5]*a[5] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl 16(%esi),%edx + adcl $0,%ebx + movl %ecx,40(%edi) + movl 28(%esi),%eax + # saved r[10] + # ############### Calculate word 11 + xorl %ecx,%ecx + # sqr a[7]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl 20(%esi),%edx + # sqr a[6]*a[5] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 28(%esi),%eax + adcl $0,%ecx + movl %ebp,44(%edi) + movl 20(%esi),%edx + # saved r[11] + # ############### Calculate word 12 + xorl %ebp,%ebp + # sqr a[7]*a[5] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%eax + adcl $0,%ebp + # sqr a[6]*a[6] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%edx + adcl $0,%ebp + movl %ebx,48(%edi) + movl 28(%esi),%eax + # saved r[12] + # ############### Calculate word 13 + xorl %ebx,%ebx + # sqr a[7]*a[6] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 28(%esi),%eax + adcl $0,%ebx + movl %ecx,52(%edi) + # saved r[13] + # ############### Calculate word 14 + xorl %ecx,%ecx + # sqr a[7]*a[7] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + adcl $0,%ecx + movl %ebp,56(%edi) + # saved r[14] + movl %ebx,60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_sqr_comba4 +.private_extern _bn_sqr_comba4 +.align 4 +_bn_sqr_comba4: +L_bn_sqr_comba4_begin: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp),%edi + movl 24(%esp),%esi + xorl %ebx,%ebx + xorl %ecx,%ecx + movl (%esi),%eax + # ############### Calculate word 0 + xorl %ebp,%ebp + # sqr a[0]*a[0] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,(%edi) + movl 4(%esi),%eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx,%ebx + # sqr a[1]*a[0] + mull %edx + addl %eax,%eax + 
adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + movl %ecx,4(%edi) + movl (%esi),%edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx,%ecx + # sqr a[2]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 4(%esi),%eax + adcl $0,%ecx + # sqr a[1]*a[1] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl (%esi),%edx + adcl $0,%ecx + movl %ebp,8(%edi) + movl 12(%esi),%eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp,%ebp + # sqr a[3]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 8(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[2]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 12(%esi),%eax + adcl $0,%ebp + movl %ebx,12(%edi) + movl 4(%esi),%edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx,%ebx + # sqr a[3]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + # sqr a[2]*a[2] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%edx + adcl $0,%ebx + movl %ecx,16(%edi) + movl 12(%esi),%eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx,%ecx + # sqr a[3]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 12(%esi),%eax + adcl $0,%ecx + movl %ebp,20(%edi) + # saved r[5] + # ############### Calculate word 6 + xorl %ebp,%ebp + # sqr a[3]*a[3] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + adcl $0,%ebp + movl %ebx,24(%edi) + # saved r[6] + movl %ecx,28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/co-586-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/co-586-linux.S index 4c61741a..486b1ce7 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/co-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1264,7 +1263,6 @@ bn_sqr_comba4: ret .size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-armv4-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-armv4-linux.S index 33cb9b7c..4c2cddd6 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-armv4-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -244,7 +243,6 @@ gcm_ghash_neon: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-apple.S index a1fa2e04..41de5a36 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -335,7 +334,6 @@ Lmasks: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-linux.S index fd4185bd..449962d4 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -335,7 +334,6 @@ gcm_ghash_neon: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-win.S new file mode 100644 index 00000000..996cf34d --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-win.S @@ -0,0 +1,346 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.text + +.globl gcm_init_neon + +.def gcm_init_neon + .type 32 +.endef +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl gcm_gmult_neon + +.def gcm_gmult_neon + .type 32 +.endef +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl gcm_ghash_neon + +.def gcm_ghash_neon + .type 32 +.endef +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section .rodata +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-apple.S new file mode 100644 index 00000000..5f016a29 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-apple.S @@ -0,0 +1,293 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _gcm_gmult_ssse3 +.private_extern _gcm_gmult_ssse3 +.align 4 +_gcm_gmult_ssse3: +L_gcm_gmult_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movdqu (%edi),%xmm0 + call L000pic_point +L000pic_point: + popl %eax + movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7 + movdqa Llow4_mask-L000pic_point(%eax),%xmm2 +.byte 102,15,56,0,199 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L001loop_row_1: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L001loop_row_1 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L002loop_row_2: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L002loop_row_2 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L003loop_row_3: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L003loop_row_3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,0,215 + movdqu %xmm2,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _gcm_ghash_ssse3 +.private_extern _gcm_ghash_ssse3 +.align 4 +_gcm_ghash_ssse3: +L_gcm_ghash_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + movdqu (%edi),%xmm0 + call L004pic_point +L004pic_point: + popl %ebx + movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7 + andl $-16,%ecx +.byte 102,15,56,0,199 + pxor %xmm3,%xmm3 +L005loop_ghash: + movdqa Llow4_mask-L004pic_point(%ebx),%xmm2 + movdqu (%edx),%xmm1 +.byte 102,15,56,0,207 + pxor %xmm1,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + movl $5,%eax +L006loop_row_4: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa 
%xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L006loop_row_4 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L007loop_row_5: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L007loop_row_5 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L008loop_row_6: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L008loop_row_6 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + leal -256(%esi),%esi + leal 16(%edx),%edx + subl $16,%ecx + jnz L005loop_ghash +.byte 102,15,56,0,199 + movdqu %xmm0,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 4,0x90 +Lreverse_bytes: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.align 4,0x90 +Llow4_mask: +.long 252645135,252645135,252645135,252645135 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-linux.S index cfa55517..dc28a922 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -292,7 +291,6 @@ gcm_ghash_ssse3: .Llow4_mask: .long 252645135,252645135,252645135,252645135 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-apple.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-apple.S index a334b97d..b53ae17c 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -423,7 +422,6 @@ L$low4_mask: .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-linux.S index f16865c0..5967f1b2 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -423,7 +422,6 @@ _CET_ENDBR .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-apple.S new file mode 100644 index 00000000..25dfc3c7 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-apple.S @@ -0,0 +1,327 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _gcm_init_clmul +.private_extern _gcm_init_clmul +.align 4 +_gcm_init_clmul: +L_gcm_init_clmul_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + call L000pic +L000pic: + popl %ecx + leal Lbswap-L000pic(%ecx),%ecx + movdqu (%eax),%xmm2 + pshufd $78,%xmm2,%xmm2 + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + pand 16(%ecx),%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm2,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,(%edx) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%edx) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%edx) + ret +.globl _gcm_gmult_clmul +.private_extern _gcm_gmult_clmul +.align 4 +_gcm_gmult_clmul: +L_gcm_gmult_clmul_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + call L001pic +L001pic: + popl %ecx + leal Lbswap-L001pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movups (%edx),%xmm2 +.byte 102,15,56,0,197 + movups 32(%edx),%xmm4 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,197 + movdqu %xmm0,(%eax) + ret +.globl _gcm_ghash_clmul +.private_extern _gcm_ghash_clmul +.align 4 +_gcm_ghash_clmul: +L_gcm_ghash_clmul_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%eax + movl 24(%esp),%edx + movl 28(%esp),%esi + movl 32(%esp),%ebx + call L002pic +L002pic: + popl %ecx + leal Lbswap-L002pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movdqu (%edx),%xmm2 +.byte 102,15,56,0,197 + subl $16,%ebx + jz L003odd_tail + movdqu (%esi),%xmm3 + movdqu 16(%esi),%xmm6 +.byte 102,15,56,0,221 +.byte 102,15,56,0,245 + movdqu 32(%edx),%xmm5 + pxor %xmm3,%xmm0 + pshufd $78,%xmm6,%xmm3 + movdqa %xmm6,%xmm7 + pxor %xmm6,%xmm3 + leal 32(%esi),%esi +.byte 102,15,58,68,242,0 +.byte 102,15,58,68,250,17 +.byte 102,15,58,68,221,0 + movups 16(%edx),%xmm2 + nop + subl $32,%ebx + jbe L004even_tail + jmp L005mod_loop +.align 5,0x90 +L005mod_loop: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 + nop +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,229,16 + movups (%edx),%xmm2 + xorps 
%xmm6,%xmm0 + movdqa (%ecx),%xmm5 + xorps %xmm7,%xmm1 + movdqu (%esi),%xmm7 + pxor %xmm0,%xmm3 + movdqu 16(%esi),%xmm6 + pxor %xmm1,%xmm3 +.byte 102,15,56,0,253 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 +.byte 102,15,56,0,245 + pxor %xmm7,%xmm1 + movdqa %xmm6,%xmm7 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 +.byte 102,15,58,68,242,0 + movups 32(%edx),%xmm5 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm7,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm7,%xmm3 + pxor %xmm4,%xmm1 +.byte 102,15,58,68,250,17 + movups 16(%edx),%xmm2 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,58,68,221,0 + leal 32(%esi),%esi + subl $32,%ebx + ja L005mod_loop +L004even_tail: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,229,16 + movdqa (%ecx),%xmm5 + xorps %xmm6,%xmm0 + xorps %xmm7,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testl %ebx,%ebx + jnz L006done + movups (%edx),%xmm2 +L003odd_tail: + movdqu (%esi),%xmm3 +.byte 102,15,56,0,221 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +L006done: +.byte 102,15,56,0,197 + movdqu %xmm0,(%eax) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +Lbswap: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +.byte 0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-linux.S index 1acf76aa..2c54b18e 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-linux.linux.x86.S +++ 
b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -328,7 +327,6 @@ gcm_ghash_clmul: .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-apple.S index bd76e53b..310c052a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -614,6 +613,7 @@ L$done: .p2align 5 _gcm_init_avx: + _CET_ENDBR vzeroupper @@ -736,6 +736,7 @@ _CET_ENDBR .p2align 5 _gcm_ghash_avx: + _CET_ENDBR vzeroupper @@ -1125,7 +1126,6 @@ L$7_mask: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-linux.S index 8f473262..4cfc5b30 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -614,6 +613,7 @@ _CET_ENDBR .align 32 gcm_init_avx: .cfi_startproc + _CET_ENDBR vzeroupper @@ -736,6 +736,7 @@ _CET_ENDBR .align 32 gcm_ghash_avx: .cfi_startproc + _CET_ENDBR vzeroupper @@ -1125,7 +1126,6 @@ _CET_ENDBR .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv7-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv7-linux.S index 2d9c60b5..f552f576 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv7-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -246,7 +245,6 @@ gcm_ghash_v8: .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-apple.S index 0e7c3c4a..467df8f2 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -565,7 +564,6 @@ Ldone4x: .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-linux.S index 6f861858..edfda39a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -565,7 +564,6 @@ gcm_ghash_v8_4x: .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-win.S new file mode 100644 index 00000000..d0b9a02c --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-win.S @@ -0,0 +1,578 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.globl gcm_init_v8 + +.def gcm_init_v8 + .type 32 +.endef +.align 4 +gcm_init_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret + +.globl gcm_gmult_v8 + +.def gcm_gmult_v8 + .type 32 +.endef +.align 4 +gcm_gmult_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.globl gcm_ghash_v8 + +.def gcm_ghash_v8 + .type 32 +.endef +.align 4 +gcm_ghash_v8: + AARCH64_VALID_CALL_TARGET + cmp x3,#64 + b.hs Lgcm_ghash_v8_4x + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? + ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b + rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b Loop_mod2x_v8 + +.align 4 +Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? 
+ eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq Ldone_v8 //is x3 zero? +Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +Ldone_v8: +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.def gcm_ghash_v8_4x + .type 32 +.endef +.align 4 +gcm_ghash_v8_4x: +Lgcm_ghash_v8_4x: + ld1 {v0.2d},[x0] //load [rotated] Xi + ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant + + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + ext v25.16b,v7.16b,v7.16b,#8 + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + pmull2 v31.1q,v20.2d,v25.2d + pmull v30.1q,v21.1d,v7.1d + + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#128 + b.lo Ltail4x + + b Loop4x + +.align 4 +Loop4x: + eor v16.16b,v4.16b,v0.16b + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 + ext v3.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + ext 
v25.16b,v7.16b,v7.16b,#8 + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + ext v24.16b,v6.16b,v6.16b,#8 + eor v1.16b,v1.16b,v30.16b + ext v23.16b,v5.16b,v5.16b,#8 + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + eor v1.16b,v1.16b,v17.16b + pmull2 v31.1q,v20.2d,v25.2d + eor v1.16b,v1.16b,v18.16b + pmull v30.1q,v21.1d,v7.1d + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + eor v0.16b,v1.16b,v18.16b + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + eor v18.16b,v18.16b,v2.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v0.16b,v0.16b,v18.16b + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#64 + b.hs Loop4x + +Ltail4x: + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + + adds x3,x3,#64 + b.eq Ldone4x + + cmp x3,#32 + b.lo Lone + b.eq Ltwo +Lthree: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d,v6.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + pmull v29.1q,v20.1d,v24.1d //H·Ii+2 + eor v6.16b,v6.16b,v24.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + pmull2 v31.1q,v20.2d,v24.2d + pmull v30.1q,v21.1d,v6.1d + eor v0.16b,v0.16b,v18.16b + pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1 + eor v5.16b,v5.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull2 v23.1q,v22.2d,v23.2d + eor v16.16b,v4.16b,v0.16b + pmull2 v5.1q,v21.2d,v5.2d + ext v3.16b,v16.16b,v16.16b,#8 + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v26.2d,v3.2d + pmull v1.1q,v27.1d,v16.1d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b Ldone4x + +.align 4 +Ltwo: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull v29.1q,v20.1d,v23.1d //H·Ii+1 + eor v5.16b,v5.16b,v23.16b + + eor v16.16b,v4.16b,v0.16b 
+ ext v3.16b,v16.16b,v16.16b,#8 + + pmull2 v31.1q,v20.2d,v23.2d + pmull v30.1q,v21.1d,v5.1d + + pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v22.2d,v3.2d + pmull2 v1.1q,v21.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b Ldone4x + +.align 4 +Lone: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v20.1d,v3.1d + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v20.2d,v3.2d + pmull v1.1q,v21.1d,v16.1d + +Ldone4x: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-apple.S index 41f40276..6bc1b99a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1726,7 +1725,6 @@ Lselect_w7_loop: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-linux.S index a3a54045..8d0bcba5 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1726,7 +1725,6 @@ ecp_nistz256_select_w7: ret .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-win.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-win.S new file mode 100644 index 00000000..6a71ed78 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-win.S @@ -0,0 +1,1771 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include "CCryptoBoringSSL_arm_arch.h" + +.section .rodata +.align 5 +Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +Lone: +.quad 1,0,0,0 +Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont + +.def ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont + +.def ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 + +.def ecp_nistz256_div_by_2 + .type 32 +.endef +.align 4 +ecp_nistz256_div_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 + +.def ecp_nistz256_mul_by_2 + .type 32 +.endef +.align 4 +ecp_nistz256_mul_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 + +.def ecp_nistz256_mul_by_3 + .type 32 +.endef +.align 4 +ecp_nistz256_mul_by_3: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + mov x8,x4 + mov x9,x5 + mov x10,x6 + mov x11,x7 + + bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_sub + +.def ecp_nistz256_sub + .type 32 +.endef +.align 4 +ecp_nistz256_sub: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg + +.def ecp_nistz256_neg + .type 32 +.endef +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 +.def __ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs 
x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 +.def __ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
+.def __ecp_nistz256_add_to + .type 32 +.endef +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_from + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_morf + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_div_by_2 + .type 32 +.endef +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret + +.globl ecp_nistz256_point_double + +.def ecp_nistz256_point_double + .type 32 +.endef +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add + +.def ecp_nistz256_point_add + .type 32 +.endef +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add_affine + +.def ecp_nistz256_point_add_affine + .type 32 +.endef +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,Lone_mont-64 + add x23,x23,:lo12:Lone_mont-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont + +.def ecp_nistz256_ord_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + 
adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont + +.def ecp_nistz256_ord_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b Loop_ord_sqr + +.align 4 +Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
+ + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 + +.def ecp_nistz256_select_w5 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +Lselect_w5_loop: + // Loop 16 times. 
+ + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz w9, #4, Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret + + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 + +.def ecp_nistz256_select_w7 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-apple.S similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-apple.S index f9ef5575..0d3c4d85 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -9,7 +8,6 @@ .text - .section __DATA,__const .p2align 6 L$poly: @@ -94,18 +92,13 @@ L$neg_epilogue: -.globl _ecp_nistz256_ord_mul_mont -.private_extern _ecp_nistz256_ord_mul_mont +.globl _ecp_nistz256_ord_mul_mont_nohw +.private_extern _ecp_nistz256_ord_mul_mont_nohw .p2align 5 -_ecp_nistz256_ord_mul_mont: +_ecp_nistz256_ord_mul_mont_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$ecp_nistz256_ord_mul_montx pushq %rbp pushq %rbx @@ -423,18 +416,13 @@ L$ord_mul_epilogue: -.globl _ecp_nistz256_ord_sqr_mont -.private_extern _ecp_nistz256_ord_sqr_mont +.globl _ecp_nistz256_ord_sqr_mont_nohw +.private_extern _ecp_nistz256_ord_sqr_mont_nohw .p2align 5 -_ecp_nistz256_ord_sqr_mont: +_ecp_nistz256_ord_sqr_mont_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$ecp_nistz256_ord_sqr_montx pushq %rbp pushq %rbx @@ -716,11 +704,14 @@ L$ord_sqr_epilogue: +.globl _ecp_nistz256_ord_mul_mont_adx +.private_extern _ecp_nistz256_ord_mul_mont_adx .p2align 5 -ecp_nistz256_ord_mul_montx: +_ecp_nistz256_ord_mul_mont_adx: -L$ecp_nistz256_ord_mul_montx: +L$ecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR pushq %rbp pushq %rbx @@ -952,11 +943,14 @@ L$ord_mulx_epilogue: +.globl _ecp_nistz256_ord_sqr_mont_adx +.private_extern _ecp_nistz256_ord_sqr_mont_adx .p2align 5 -ecp_nistz256_ord_sqr_montx: +_ecp_nistz256_ord_sqr_mont_adx: -L$ecp_nistz256_ord_sqr_montx: +_CET_ENDBR +L$ecp_nistz256_ord_sqr_mont_adx: pushq %rbp pushq %rbx @@ -1165,17 +1159,13 @@ L$ord_sqrx_epilogue: -.globl _ecp_nistz256_mul_mont -.private_extern _ecp_nistz256_mul_mont +.globl _ecp_nistz256_mul_mont_nohw +.private_extern _ecp_nistz256_mul_mont_nohw .p2align 5 -_ecp_nistz256_mul_mont: +_ecp_nistz256_mul_mont_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx -L$mul_mont: pushq %rbp pushq %rbx @@ -1189,8 +1179,6 @@ L$mul_mont: pushq %r15 L$mul_body: - cmpl $0x80100,%ecx - je L$mul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -1199,20 +1187,7 @@ L$mul_body: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq - jmp L$mul_mont_done - -.p2align 5 -L$mul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - call __ecp_nistz256_mul_montx -L$mul_mont_done: movq 0(%rsp),%r15 movq 8(%rsp),%r14 @@ -1457,16 +1432,13 @@ __ecp_nistz256_mul_montq: -.globl _ecp_nistz256_sqr_mont -.private_extern _ecp_nistz256_sqr_mont +.globl _ecp_nistz256_sqr_mont_nohw +.private_extern _ecp_nistz256_sqr_mont_nohw .p2align 5 -_ecp_nistz256_sqr_mont: +_ecp_nistz256_sqr_mont_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx pushq %rbp pushq %rbx @@ -1480,26 +1452,13 @@ _CET_ENDBR pushq %r15 L$sqr_body: - cmpl $0x80100,%ecx - je L$sqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq - jmp L$sqr_mont_done - -.p2align 5 -L$sqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - call __ecp_nistz256_sqr_montx -L$sqr_mont_done: movq 0(%rsp),%r15 movq 8(%rsp),%r14 @@ -1682,6 +1641,55 @@ __ecp_nistz256_sqr_montq: ret +.globl _ecp_nistz256_mul_mont_adx +.private_extern _ecp_nistz256_mul_mont_adx + +.p2align 5 +_ecp_nistz256_mul_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + 
+L$mulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mulx_epilogue: + ret + + + .p2align 5 __ecp_nistz256_mul_montx: @@ -1851,6 +1859,53 @@ __ecp_nistz256_mul_montx: +.globl _ecp_nistz256_sqr_mont_adx +.private_extern _ecp_nistz256_sqr_mont_adx + +.p2align 5 +_ecp_nistz256_sqr_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqrx_epilogue: + ret + + + .p2align 5 __ecp_nistz256_sqr_montx: @@ -1982,17 +2037,13 @@ __ecp_nistz256_sqr_montx: -.globl _ecp_nistz256_select_w5 -.private_extern _ecp_nistz256_select_w5 +.globl _ecp_nistz256_select_w5_nohw +.private_extern _ecp_nistz256_select_w5_nohw .p2align 5 -_ecp_nistz256_select_w5: +_ecp_nistz256_select_w5_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz L$avx2_select_w5 movdqa L$One(%rip),%xmm0 movd %edx,%xmm1 @@ -2045,22 +2096,18 @@ L$select_loop_sse_w5: movdqu %xmm7,80(%rdi) ret -L$SEH_end_ecp_nistz256_select_w5: +L$SEH_end_ecp_nistz256_select_w5_nohw: -.globl _ecp_nistz256_select_w7 -.private_extern _ecp_nistz256_select_w7 +.globl _ecp_nistz256_select_w7_nohw +.private_extern _ecp_nistz256_select_w7_nohw .p2align 5 -_ecp_nistz256_select_w7: +_ecp_nistz256_select_w7_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz L$avx2_select_w7 movdqa L$One(%rip),%xmm8 movd %edx,%xmm1 @@ -2102,15 +2149,17 @@ L$select_loop_sse_w7: movdqu %xmm5,48(%rdi) ret -L$SEH_end_ecp_nistz256_select_w7: +L$SEH_end_ecp_nistz256_select_w7_nohw: +.globl _ecp_nistz256_select_w5_avx2 +.private_extern _ecp_nistz256_select_w5_avx2 .p2align 5 -ecp_nistz256_avx2_select_w5: +_ecp_nistz256_select_w5_avx2: -L$avx2_select_w5: +_CET_ENDBR vzeroupper vmovdqa L$Two(%rip),%ymm0 @@ -2165,18 +2214,17 @@ L$select_loop_avx2_w5: vzeroupper ret -L$SEH_end_ecp_nistz256_avx2_select_w5: +L$SEH_end_ecp_nistz256_select_w5_avx2: -.globl _ecp_nistz256_avx2_select_w7 -.private_extern _ecp_nistz256_avx2_select_w7 +.globl _ecp_nistz256_select_w7_avx2 +.private_extern _ecp_nistz256_select_w7_avx2 .p2align 5 -_ecp_nistz256_avx2_select_w7: +_ecp_nistz256_select_w7_avx2: -L$avx2_select_w7: _CET_ENDBR vzeroupper vmovdqa L$Three(%rip),%ymm0 @@ -2247,7 +2295,7 @@ L$select_loop_avx2_w7: vzeroupper ret -L$SEH_end_ecp_nistz256_avx2_select_w7: +L$SEH_end_ecp_nistz256_select_w7_avx2: .p2align 5 @@ -2378,18 +2426,13 @@ __ecp_nistz256_mul_by_2q: ret -.globl _ecp_nistz256_point_double -.private_extern _ecp_nistz256_point_double +.globl _ecp_nistz256_point_double_nohw +.private_extern _ecp_nistz256_point_double_nohw .p2align 5 -_ecp_nistz256_point_double: +_ecp_nistz256_point_double_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_doublex pushq %rbp pushq %rbx @@ -2607,18 +2650,13 @@ L$point_doubleq_epilogue: ret -.globl _ecp_nistz256_point_add -.private_extern 
_ecp_nistz256_point_add +.globl _ecp_nistz256_point_add_nohw +.private_extern _ecp_nistz256_point_add_nohw .p2align 5 -_ecp_nistz256_point_add: +_ecp_nistz256_point_add_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_addx pushq %rbp pushq %rbx @@ -3039,18 +3077,13 @@ L$point_addq_epilogue: ret -.globl _ecp_nistz256_point_add_affine -.private_extern _ecp_nistz256_point_add_affine +.globl _ecp_nistz256_point_add_affine_nohw +.private_extern _ecp_nistz256_point_add_affine_nohw .p2align 5 -_ecp_nistz256_point_add_affine: +_ecp_nistz256_point_add_affine_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_add_affinex pushq %rbp pushq %rbx @@ -3503,11 +3536,13 @@ __ecp_nistz256_mul_by_2x: ret +.globl _ecp_nistz256_point_double_adx +.private_extern _ecp_nistz256_point_double_adx .p2align 5 -ecp_nistz256_point_doublex: +_ecp_nistz256_point_double_adx: -L$point_doublex: +_CET_ENDBR pushq %rbp pushq %rbx @@ -3725,11 +3760,13 @@ L$point_doublex_epilogue: ret +.globl _ecp_nistz256_point_add_adx +.private_extern _ecp_nistz256_point_add_adx .p2align 5 -ecp_nistz256_point_addx: +_ecp_nistz256_point_add_adx: -L$point_addx: +_CET_ENDBR pushq %rbp pushq %rbx @@ -4150,11 +4187,13 @@ L$point_addx_epilogue: ret +.globl _ecp_nistz256_point_add_affine_adx +.private_extern _ecp_nistz256_point_add_affine_adx .p2align 5 -ecp_nistz256_point_add_affinex: +_ecp_nistz256_point_add_affine_adx: -L$point_add_affinex: +_CET_ENDBR pushq %rbp pushq %rbx @@ -4473,7 +4512,6 @@ L$add_affinex_epilogue: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-linux.S similarity index 92% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-linux.S index 7bd14501..eec11d7c 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -7,8 +6,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P .section .rodata @@ -97,18 +94,13 @@ _CET_ENDBR -.globl ecp_nistz256_ord_mul_mont -.hidden ecp_nistz256_ord_mul_mont -.type ecp_nistz256_ord_mul_mont,@function +.globl ecp_nistz256_ord_mul_mont_nohw +.hidden ecp_nistz256_ord_mul_mont_nohw +.type ecp_nistz256_ord_mul_mont_nohw,@function .align 32 -ecp_nistz256_ord_mul_mont: +ecp_nistz256_ord_mul_mont_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_mul_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -424,7 +416,7 @@ _CET_ENDBR .Lord_mul_epilogue: ret .cfi_endproc -.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont +.size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw @@ -432,18 +424,13 @@ _CET_ENDBR -.globl ecp_nistz256_ord_sqr_mont -.hidden ecp_nistz256_ord_sqr_mont -.type ecp_nistz256_ord_sqr_mont,@function +.globl ecp_nistz256_ord_sqr_mont_nohw +.hidden ecp_nistz256_ord_sqr_mont_nohw +.type ecp_nistz256_ord_sqr_mont_nohw,@function .align 32 -ecp_nistz256_ord_sqr_mont: +ecp_nistz256_ord_sqr_mont_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_sqr_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -729,13 +716,16 @@ _CET_ENDBR .Lord_sqr_epilogue: ret .cfi_endproc -.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +.size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw -.type ecp_nistz256_ord_mul_montx,@function +.globl ecp_nistz256_ord_mul_mont_adx +.hidden ecp_nistz256_ord_mul_mont_adx +.type ecp_nistz256_ord_mul_mont_adx,@function .align 32 -ecp_nistz256_ord_mul_montx: +ecp_nistz256_ord_mul_mont_adx: .cfi_startproc -.Lecp_nistz256_ord_mul_montx: +.Lecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -971,13 +961,16 @@ ecp_nistz256_ord_mul_montx: .Lord_mulx_epilogue: ret .cfi_endproc -.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx +.size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx -.type ecp_nistz256_ord_sqr_montx,@function +.globl ecp_nistz256_ord_sqr_mont_adx +.hidden ecp_nistz256_ord_sqr_mont_adx +.type ecp_nistz256_ord_sqr_mont_adx,@function .align 32 -ecp_nistz256_ord_sqr_montx: +ecp_nistz256_ord_sqr_mont_adx: .cfi_startproc -.Lecp_nistz256_ord_sqr_montx: +_CET_ENDBR +.Lecp_nistz256_ord_sqr_mont_adx: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1185,24 +1178,20 @@ ecp_nistz256_ord_sqr_montx: .Lord_sqrx_epilogue: ret .cfi_endproc -.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx +.size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx -.globl ecp_nistz256_mul_mont -.hidden ecp_nistz256_mul_mont -.type ecp_nistz256_mul_mont,@function +.globl ecp_nistz256_mul_mont_nohw +.hidden ecp_nistz256_mul_mont_nohw +.type ecp_nistz256_mul_mont_nohw,@function .align 32 -ecp_nistz256_mul_mont: +ecp_nistz256_mul_mont_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx -.Lmul_mont: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1222,8 +1211,6 @@ _CET_ENDBR .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmul_body: - cmpl $0x80100,%ecx - je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -1232,20 +1219,7 
@@ _CET_ENDBR movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq - jmp .Lmul_mont_done -.align 32 -.Lmul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_mul_montx -.Lmul_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 @@ -1263,7 +1237,7 @@ _CET_ENDBR .Lmul_epilogue: ret .cfi_endproc -.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont +.size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw .type __ecp_nistz256_mul_montq,@function .align 32 @@ -1490,16 +1464,13 @@ __ecp_nistz256_mul_montq: -.globl ecp_nistz256_sqr_mont -.hidden ecp_nistz256_sqr_mont -.type ecp_nistz256_sqr_mont,@function +.globl ecp_nistz256_sqr_mont_nohw +.hidden ecp_nistz256_sqr_mont_nohw +.type ecp_nistz256_sqr_mont_nohw,@function .align 32 -ecp_nistz256_sqr_mont: +ecp_nistz256_sqr_mont_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1519,26 +1490,13 @@ _CET_ENDBR .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqr_body: - cmpl $0x80100,%ecx - je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq - jmp .Lsqr_mont_done - -.align 32 -.Lsqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - call __ecp_nistz256_sqr_montx -.Lsqr_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 @@ -1556,7 +1514,7 @@ _CET_ENDBR .Lsqr_epilogue: ret .cfi_endproc -.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont +.size ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw .type __ecp_nistz256_sqr_montq,@function .align 32 @@ -1721,6 +1679,61 @@ __ecp_nistz256_sqr_montq: ret .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.globl ecp_nistz256_mul_mont_adx +.hidden ecp_nistz256_mul_mont_adx +.type ecp_nistz256_mul_mont_adx,@function +.align 32 +ecp_nistz256_mul_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx + .type __ecp_nistz256_mul_montx,@function .align 32 __ecp_nistz256_mul_montx: @@ -1890,6 +1903,59 @@ __ecp_nistz256_mul_montx: .cfi_endproc .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx +.globl ecp_nistz256_sqr_mont_adx +.hidden ecp_nistz256_sqr_mont_adx +.type ecp_nistz256_sqr_mont_adx,@function +.align 32 +ecp_nistz256_sqr_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + 
pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx + .type __ecp_nistz256_sqr_montx,@function .align 32 __ecp_nistz256_sqr_montx: @@ -2021,17 +2087,13 @@ __ecp_nistz256_sqr_montx: .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx -.globl ecp_nistz256_select_w5 -.hidden ecp_nistz256_select_w5 -.type ecp_nistz256_select_w5,@function +.globl ecp_nistz256_select_w5_nohw +.hidden ecp_nistz256_select_w5_nohw +.type ecp_nistz256_select_w5_nohw,@function .align 32 -ecp_nistz256_select_w5: +ecp_nistz256_select_w5_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz .Lavx2_select_w5 movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 @@ -2084,22 +2146,18 @@ _CET_ENDBR movdqu %xmm7,80(%rdi) ret .cfi_endproc -.LSEH_end_ecp_nistz256_select_w5: -.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 +.LSEH_end_ecp_nistz256_select_w5_nohw: +.size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw -.globl ecp_nistz256_select_w7 -.hidden ecp_nistz256_select_w7 -.type ecp_nistz256_select_w7,@function +.globl ecp_nistz256_select_w7_nohw +.hidden ecp_nistz256_select_w7_nohw +.type ecp_nistz256_select_w7_nohw,@function .align 32 -ecp_nistz256_select_w7: +ecp_nistz256_select_w7_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz .Lavx2_select_w7 movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 @@ -2141,15 +2199,17 @@ _CET_ENDBR movdqu %xmm5,48(%rdi) ret .cfi_endproc -.LSEH_end_ecp_nistz256_select_w7: -.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +.LSEH_end_ecp_nistz256_select_w7_nohw: +.size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw -.type ecp_nistz256_avx2_select_w5,@function +.globl ecp_nistz256_select_w5_avx2 +.hidden ecp_nistz256_select_w5_avx2 +.type ecp_nistz256_select_w5_avx2,@function .align 32 -ecp_nistz256_avx2_select_w5: +ecp_nistz256_select_w5_avx2: .cfi_startproc -.Lavx2_select_w5: +_CET_ENDBR vzeroupper vmovdqa .LTwo(%rip),%ymm0 @@ -2204,18 +2264,17 @@ ecp_nistz256_avx2_select_w5: vzeroupper ret .cfi_endproc -.LSEH_end_ecp_nistz256_avx2_select_w5: -.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 +.LSEH_end_ecp_nistz256_select_w5_avx2: +.size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 -.globl ecp_nistz256_avx2_select_w7 -.hidden ecp_nistz256_avx2_select_w7 -.type ecp_nistz256_avx2_select_w7,@function +.globl ecp_nistz256_select_w7_avx2 +.hidden ecp_nistz256_select_w7_avx2 +.type ecp_nistz256_select_w7_avx2,@function .align 32 -ecp_nistz256_avx2_select_w7: +ecp_nistz256_select_w7_avx2: .cfi_startproc -.Lavx2_select_w7: _CET_ENDBR vzeroupper vmovdqa .LThree(%rip),%ymm0 @@ -2286,8 +2345,8 @@ _CET_ENDBR vzeroupper ret .cfi_endproc -.LSEH_end_ecp_nistz256_avx2_select_w7: -.size 
ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.LSEH_end_ecp_nistz256_select_w7_avx2: +.size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 .type __ecp_nistz256_add_toq,@function .align 32 __ecp_nistz256_add_toq: @@ -2417,18 +2476,13 @@ __ecp_nistz256_mul_by_2q: ret .cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q -.globl ecp_nistz256_point_double -.hidden ecp_nistz256_point_double -.type ecp_nistz256_point_double,@function +.globl ecp_nistz256_point_double_nohw +.hidden ecp_nistz256_point_double_nohw +.type ecp_nistz256_point_double_nohw,@function .align 32 -ecp_nistz256_point_double: +ecp_nistz256_point_double_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_doublex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2651,19 +2705,14 @@ _CET_ENDBR .Lpoint_doubleq_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_double,.-ecp_nistz256_point_double -.globl ecp_nistz256_point_add -.hidden ecp_nistz256_point_add -.type ecp_nistz256_point_add,@function +.size ecp_nistz256_point_double_nohw,.-ecp_nistz256_point_double_nohw +.globl ecp_nistz256_point_add_nohw +.hidden ecp_nistz256_point_add_nohw +.type ecp_nistz256_point_add_nohw,@function .align 32 -ecp_nistz256_point_add: +ecp_nistz256_point_add_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_addx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3089,19 +3138,14 @@ _CET_ENDBR .Lpoint_addq_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_add,.-ecp_nistz256_point_add -.globl ecp_nistz256_point_add_affine -.hidden ecp_nistz256_point_add_affine -.type ecp_nistz256_point_add_affine,@function +.size ecp_nistz256_point_add_nohw,.-ecp_nistz256_point_add_nohw +.globl ecp_nistz256_point_add_affine_nohw +.hidden ecp_nistz256_point_add_affine_nohw +.type ecp_nistz256_point_add_affine_nohw,@function .align 32 -ecp_nistz256_point_add_affine: +ecp_nistz256_point_add_affine_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_add_affinex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3424,7 +3468,7 @@ _CET_ENDBR .Ladd_affineq_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +.size ecp_nistz256_point_add_affine_nohw,.-ecp_nistz256_point_add_affine_nohw .type __ecp_nistz256_add_tox,@function .align 32 __ecp_nistz256_add_tox: @@ -3560,11 +3604,13 @@ __ecp_nistz256_mul_by_2x: ret .cfi_endproc .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x -.type ecp_nistz256_point_doublex,@function +.globl ecp_nistz256_point_double_adx +.hidden ecp_nistz256_point_double_adx +.type ecp_nistz256_point_double_adx,@function .align 32 -ecp_nistz256_point_doublex: +ecp_nistz256_point_double_adx: .cfi_startproc -.Lpoint_doublex: +_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3787,12 +3833,14 @@ ecp_nistz256_point_doublex: .Lpoint_doublex_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex -.type ecp_nistz256_point_addx,@function +.size ecp_nistz256_point_double_adx,.-ecp_nistz256_point_double_adx +.globl ecp_nistz256_point_add_adx +.hidden ecp_nistz256_point_add_adx +.type ecp_nistz256_point_add_adx,@function .align 32 -ecp_nistz256_point_addx: +ecp_nistz256_point_add_adx: .cfi_startproc -.Lpoint_addx: 
+_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -4218,12 +4266,14 @@ ecp_nistz256_point_addx: .Lpoint_addx_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx -.type ecp_nistz256_point_add_affinex,@function +.size ecp_nistz256_point_add_adx,.-ecp_nistz256_point_add_adx +.globl ecp_nistz256_point_add_affine_adx +.hidden ecp_nistz256_point_add_affine_adx +.type ecp_nistz256_point_add_affine_adx,@function .align 32 -ecp_nistz256_point_add_affinex: +ecp_nistz256_point_add_affine_adx: .cfi_startproc -.Lpoint_add_affinex: +_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -4546,9 +4596,8 @@ ecp_nistz256_point_add_affinex: .Ladd_affinex_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex +.size ecp_nistz256_point_add_affine_adx,.-ecp_nistz256_point_add_affine_adx #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-apple.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-apple.S index 9a5941d6..1acdb0b2 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -309,7 +308,6 @@ Lbeeu_finish: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-linux.S index 7f6451f7..061c9d54 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -309,7 +308,6 @@ beeu_mod_inverse_vartime: ret .size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-win.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-win.S new file mode 100644 index 00000000..2f7905d2 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-win.S @@ -0,0 +1,314 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include "CCryptoBoringSSL_arm_arch.h" + +.text +.globl beeu_mod_inverse_vartime + + +.align 4 +beeu_mod_inverse_vartime: + // Reserve enough space for 14 8-byte registers on the stack + // in the first stp call for x29, x30. + // Then store the remaining callee-saved registers. + // + // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | + // ^ ^ + // sp <------------------- 112 bytes ----------------> old sp + // x29 (FP) + // + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-112]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x2,[sp,#96] + + // B = b3..b0 := a + ldp x25,x26,[x1] + ldp x27,x28,[x1,#16] + + // n3..n0 := n + // Note: the value of input params are changed in the following. + ldp x0,x1,[x2] + ldp x2,x30,[x2,#16] + + // A = a3..a0 := n + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x24, x30 + + // X = x4..x0 := 1 + mov x3, #1 + eor x4, x4, x4 + eor x5, x5, x5 + eor x6, x6, x6 + eor x7, x7, x7 + + // Y = y4..y0 := 0 + eor x8, x8, x8 + eor x9, x9, x9 + eor x10, x10, x10 + eor x11, x11, x11 + eor x12, x12, x12 + +Lbeeu_loop: + // if B == 0, jump to .Lbeeu_loop_end + orr x14, x25, x26 + orr x14, x14, x27 + + // reverse the bit order of x25. This is needed for clz after this macro + rbit x15, x25 + + orr x14, x14, x28 + cbz x14,Lbeeu_loop_end + + + // 0 < B < |n|, + // 0 < A <= |n|, + // (1) X*a == B (mod |n|), + // (2) (-1)*Y*a == A (mod |n|) + + // Now divide B by the maximum possible power of two in the + // integers, and divide X by the same value mod |n|. + // When we're done, (1) still holds. + + // shift := number of trailing 0s in x25 + // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) + clz x13, x15 + + // If there is no shift, goto shift_A_Y + cbz x13, Lbeeu_shift_A_Y + + // Shift B right by "x13" bits + neg x14, x13 + lsr x25, x25, x13 + lsl x15, x26, x14 + + lsr x26, x26, x13 + lsl x19, x27, x14 + + orr x25, x25, x15 + + lsr x27, x27, x13 + lsl x20, x28, x14 + + orr x26, x26, x19 + + lsr x28, x28, x13 + + orr x27, x27, x20 + + + // Shift X right by "x13" bits, adding n whenever X becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +Lbeeu_shift_loop_X: + tbz x3, #0, Lshift1_0 + adds x3, x3, x0 + adcs x4, x4, x1 + adcs x5, x5, x2 + adcs x6, x6, x30 + adc x7, x7, x14 +Lshift1_0: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x3, x4, x3, #1 + extr x4, x5, x4, #1 + extr x5, x6, x5, #1 + extr x6, x7, x6, #1 + lsr x7, x7, #1 + + subs x13, x13, #1 + bne Lbeeu_shift_loop_X + + // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl + // with the following differences: + // - "x13" is set directly to the number of trailing 0s in B + // (using rbit and clz instructions) + // - The loop is only used to call SHIFT1(X) + // and x13 is decreased while executing the X loop. + // - SHIFT256(B, x13) is performed before right-shifting X; they are independent + +Lbeeu_shift_A_Y: + // Same for A and Y. + // Afterwards, (2) still holds. 
+ // Reverse the bit order of x21 + // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) + rbit x15, x21 + clz x13, x15 + + // If there is no shift, goto |B-A|, X+Y update + cbz x13, Lbeeu_update_B_X_or_A_Y + + // Shift A right by "x13" bits + neg x14, x13 + lsr x21, x21, x13 + lsl x15, x22, x14 + + lsr x22, x22, x13 + lsl x19, x23, x14 + + orr x21, x21, x15 + + lsr x23, x23, x13 + lsl x20, x24, x14 + + orr x22, x22, x19 + + lsr x24, x24, x13 + + orr x23, x23, x20 + + + // Shift Y right by "x13" bits, adding n whenever Y becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +Lbeeu_shift_loop_Y: + tbz x8, #0, Lshift1_1 + adds x8, x8, x0 + adcs x9, x9, x1 + adcs x10, x10, x2 + adcs x11, x11, x30 + adc x12, x12, x14 +Lshift1_1: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x8, x9, x8, #1 + extr x9, x10, x9, #1 + extr x10, x11, x10, #1 + extr x11, x12, x11, #1 + lsr x12, x12, #1 + + subs x13, x13, #1 + bne Lbeeu_shift_loop_Y + +Lbeeu_update_B_X_or_A_Y: + // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) + // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words + // without taking a sign bit if generated. The lack of a carry would + // indicate a negative result. See, for example, + // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes + subs x14, x25, x21 + sbcs x15, x26, x22 + sbcs x19, x27, x23 + sbcs x20, x28, x24 + bcs Lbeeu_B_greater_than_A + + // Else A > B => + // A := A - B; Y := Y + X; goto beginning of the loop + subs x21, x21, x25 + sbcs x22, x22, x26 + sbcs x23, x23, x27 + sbcs x24, x24, x28 + + adds x8, x8, x3 + adcs x9, x9, x4 + adcs x10, x10, x5 + adcs x11, x11, x6 + adc x12, x12, x7 + b Lbeeu_loop + +Lbeeu_B_greater_than_A: + // Continue with B > A => + // B := B - A; X := X + Y; goto beginning of the loop + mov x25, x14 + mov x26, x15 + mov x27, x19 + mov x28, x20 + + adds x3, x3, x8 + adcs x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + b Lbeeu_loop + +Lbeeu_loop_end: + // The Euclid's algorithm loop ends when A == gcd(a,n); + // this would be 1, when a and n are co-prime (i.e. do not have a common factor). + // Since (-1)*Y*a == A (mod |n|), Y>0 + // then out = -Y mod n + + // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) + // Is A-1 == 0? + // If not, fail. 
+ sub x14, x21, #1 + orr x14, x14, x22 + orr x14, x14, x23 + orr x14, x14, x24 + cbnz x14, Lbeeu_err + + // If Y>n ==> Y:=Y-n +Lbeeu_reduction_loop: + // x_i := y_i - n_i (X is no longer needed, use it as temp) + // (x14 = 0 from above) + subs x3, x8, x0 + sbcs x4, x9, x1 + sbcs x5, x10, x2 + sbcs x6, x11, x30 + sbcs x7, x12, x14 + + // If result is non-negative (i.e., cs = carry set = no borrow), + // y_i := x_i; goto reduce again + // else + // y_i := y_i; continue + csel x8, x3, x8, cs + csel x9, x4, x9, cs + csel x10, x5, x10, cs + csel x11, x6, x11, cs + csel x12, x7, x12, cs + bcs Lbeeu_reduction_loop + + // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) + // out = -Y = n-Y + subs x8, x0, x8 + sbcs x9, x1, x9 + sbcs x10, x2, x10 + sbcs x11, x30, x11 + + // Save Y in output (out (x0) was saved on the stack) + ldr x3, [sp,#96] + stp x8, x9, [x3] + stp x10, x11, [x3,#16] + // return 1 (success) + mov x0, #1 + b Lbeeu_finish + +Lbeeu_err: + // return 0 (error) + eor x0, x0, x0 + +Lbeeu_finish: + // Restore callee-saved registers, except x0, x2 + add sp,x29,#0 + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldp x29,x30,[sp],#112 + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-apple.S similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-apple.S index da8a6632..2becb6f0 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -322,7 +321,6 @@ L$beeu_finish: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-linux.S index d6531c7e..23fdd057 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -336,7 +335,6 @@ _CET_ENDBR .size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-apple.S similarity index 90% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-apple.S index 5fad1be8..e18ec29d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -57,7 +56,6 @@ L$err: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-linux.S similarity index 91% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-linux.S index 9dc67041..28abe4e1 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -57,7 +56,6 @@ _CET_ENDBR .cfi_endproc .size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-apple.S index 5eedf9cb..297eadc2 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1749,7 +1748,6 @@ L$inc: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-linux.S index 616aefa8..dfd1858f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1749,7 +1748,6 @@ _CET_ENDBR .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-apple.S new file mode 100644 index 00000000..7b75e41c --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-apple.S @@ -0,0 +1,3787 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha1_block_data_order_nohw +.private_extern _sha1_block_data_order_nohw +.align 4 +_sha1_block_data_order_nohw: +L_sha1_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%ebp + movl 24(%esp),%esi + movl 28(%esp),%eax + subl $76,%esp + shll $6,%eax + addl %esi,%eax + movl %eax,104(%esp) + movl 16(%ebp),%edi + jmp L000loop +.align 4,0x90 +L000loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,(%esp) + movl %ebx,4(%esp) + movl %ecx,8(%esp) + movl %edx,12(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,16(%esp) + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %edx,28(%esp) + movl 32(%esi),%eax + movl 36(%esi),%ebx + movl 40(%esi),%ecx + movl 44(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edx,44(%esp) + movl 48(%esi),%eax + movl 52(%esi),%ebx + movl 56(%esi),%ecx + movl 60(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,48(%esp) + movl %ebx,52(%esp) + movl %ecx,56(%esp) + movl %edx,60(%esp) + movl %esi,100(%esp) + movl (%ebp),%eax + movl 4(%ebp),%ebx + movl 8(%ebp),%ecx + movl 12(%ebp),%edx + # 00_15 0 + movl %ecx,%esi + movl %eax,%ebp + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl (%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 1 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 4(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 2 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 8(%esp),%ecx + andl %esi,%edx + rorl $2,%esi 
+ xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 3 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 12(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + # 00_15 4 + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 16(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + # 00_15 5 + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 20(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + # 00_15 6 + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 24(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 7 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 28(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 8 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 32(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 9 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 36(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + # 00_15 10 + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 40(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + # 00_15 11 + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 44(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + # 00_15 12 + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 48(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 13 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 52(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 14 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 56(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 15 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 60(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + movl (%esp),%ebx + addl %ebp,%ecx + # 16_19 16 + movl %edi,%ebp + xorl 8(%esp),%ebx + xorl %esi,%ebp + xorl 32(%esp),%ebx + andl %edx,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + xorl %esi,%ebp + addl %ebp,%eax + movl %ecx,%ebp + rorl $2,%edx + movl %ebx,(%esp) + roll $5,%ebp + leal 1518500249(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + # 16_19 17 + movl %edx,%ebp + xorl 12(%esp),%eax + xorl %edi,%ebp + xorl 36(%esp),%eax + andl %ecx,%ebp + xorl 56(%esp),%eax + roll $1,%eax + xorl %edi,%ebp + addl %ebp,%esi + movl %ebx,%ebp + rorl $2,%ecx + movl %eax,4(%esp) + roll $5,%ebp + leal 1518500249(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl 
%ebp,%eax + # 16_19 18 + movl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 40(%esp),%esi + andl %ebx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + xorl %edx,%ebp + addl %ebp,%edi + movl %eax,%ebp + rorl $2,%ebx + movl %esi,8(%esp) + roll $5,%ebp + leal 1518500249(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + # 16_19 19 + movl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 44(%esp),%edi + andl %eax,%ebp + xorl (%esp),%edi + roll $1,%edi + xorl %ecx,%ebp + addl %ebp,%edx + movl %esi,%ebp + rorl $2,%eax + movl %edi,12(%esp) + roll $5,%ebp + leal 1518500249(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + # 20_39 20 + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + # 20_39 21 + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + # 20_39 22 + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + # 20_39 23 + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + # 20_39 24 + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + # 20_39 25 + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,36(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + # 20_39 26 + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + # 20_39 27 + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + # 20_39 28 + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + # 20_39 29 + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + 
movl %eax,52(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + # 20_39 30 + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,56(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + # 20_39 31 + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,60(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl (%esp),%edx + addl %ebp,%edi + # 20_39 32 + movl %esi,%ebp + xorl 8(%esp),%edx + xorl %eax,%ebp + xorl 32(%esp),%edx + xorl %ebx,%ebp + xorl 52(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 4(%esp),%ecx + addl %ebp,%edx + # 20_39 33 + movl %edi,%ebp + xorl 12(%esp),%ecx + xorl %esi,%ebp + xorl 36(%esp),%ecx + xorl %eax,%ebp + xorl 56(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,4(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 8(%esp),%ebx + addl %ebp,%ecx + # 20_39 34 + movl %edx,%ebp + xorl 16(%esp),%ebx + xorl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl 60(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,8(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 12(%esp),%eax + addl %ebp,%ebx + # 20_39 35 + movl %ecx,%ebp + xorl 20(%esp),%eax + xorl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl (%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,12(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 16(%esp),%esi + addl %ebp,%eax + # 20_39 36 + movl %ebx,%ebp + xorl 24(%esp),%esi + xorl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 4(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,16(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 20(%esp),%edi + addl %ebp,%esi + # 20_39 37 + movl %eax,%ebp + xorl 28(%esp),%edi + xorl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 8(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,20(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 24(%esp),%edx + addl %ebp,%edi + # 20_39 38 + movl %esi,%ebp + xorl 32(%esp),%edx + xorl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 12(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,24(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 28(%esp),%ecx + addl %ebp,%edx + # 20_39 39 + movl %edi,%ebp + xorl 36(%esp),%ecx + xorl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 16(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,28(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 32(%esp),%ebx + addl %ebp,%ecx + # 40_59 40 + movl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl (%esp),%ebx + andl %edx,%ebp + xorl 20(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,32(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 36(%esp),%eax + addl %ebp,%ebx + # 40_59 41 + movl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl 4(%esp),%eax + andl %ecx,%ebp + xorl 
24(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,36(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 40(%esp),%esi + addl %ebp,%eax + # 40_59 42 + movl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 8(%esp),%esi + andl %ebx,%ebp + xorl 28(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,40(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 44(%esp),%edi + addl %ebp,%esi + # 40_59 43 + movl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 12(%esp),%edi + andl %eax,%ebp + xorl 32(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,44(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 48(%esp),%edx + addl %ebp,%edi + # 40_59 44 + movl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 16(%esp),%edx + andl %esi,%ebp + xorl 36(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,48(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 52(%esp),%ecx + addl %ebp,%edx + # 40_59 45 + movl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 20(%esp),%ecx + andl %edi,%ebp + xorl 40(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,52(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 56(%esp),%ebx + addl %ebp,%ecx + # 40_59 46 + movl %edi,%ebp + xorl (%esp),%ebx + xorl %esi,%ebp + xorl 24(%esp),%ebx + andl %edx,%ebp + xorl 44(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,56(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 60(%esp),%eax + addl %ebp,%ebx + # 40_59 47 + movl %edx,%ebp + xorl 4(%esp),%eax + xorl %edi,%ebp + xorl 28(%esp),%eax + andl %ecx,%ebp + xorl 48(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,60(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl (%esp),%esi + addl %ebp,%eax + # 40_59 48 + movl %ecx,%ebp + xorl 8(%esp),%esi + xorl %edx,%ebp + xorl 32(%esp),%esi + andl %ebx,%ebp + xorl 52(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 4(%esp),%edi + addl %ebp,%esi + # 40_59 49 + movl %ebx,%ebp + xorl 12(%esp),%edi + xorl %ecx,%ebp + xorl 36(%esp),%edi + andl %eax,%ebp + xorl 56(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,4(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 8(%esp),%edx + addl %ebp,%edi + # 40_59 50 + movl %eax,%ebp + xorl 16(%esp),%edx + xorl %ebx,%ebp + xorl 40(%esp),%edx + andl %esi,%ebp + xorl 60(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,8(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 12(%esp),%ecx + addl %ebp,%edx + # 40_59 51 + movl %esi,%ebp + xorl 20(%esp),%ecx + xorl %eax,%ebp + xorl 44(%esp),%ecx + andl %edi,%ebp + xorl (%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + 
rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,12(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 16(%esp),%ebx + addl %ebp,%ecx + # 40_59 52 + movl %edi,%ebp + xorl 24(%esp),%ebx + xorl %esi,%ebp + xorl 48(%esp),%ebx + andl %edx,%ebp + xorl 4(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,16(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 20(%esp),%eax + addl %ebp,%ebx + # 40_59 53 + movl %edx,%ebp + xorl 28(%esp),%eax + xorl %edi,%ebp + xorl 52(%esp),%eax + andl %ecx,%ebp + xorl 8(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,20(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 24(%esp),%esi + addl %ebp,%eax + # 40_59 54 + movl %ecx,%ebp + xorl 32(%esp),%esi + xorl %edx,%ebp + xorl 56(%esp),%esi + andl %ebx,%ebp + xorl 12(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,24(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 28(%esp),%edi + addl %ebp,%esi + # 40_59 55 + movl %ebx,%ebp + xorl 36(%esp),%edi + xorl %ecx,%ebp + xorl 60(%esp),%edi + andl %eax,%ebp + xorl 16(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,28(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 32(%esp),%edx + addl %ebp,%edi + # 40_59 56 + movl %eax,%ebp + xorl 40(%esp),%edx + xorl %ebx,%ebp + xorl (%esp),%edx + andl %esi,%ebp + xorl 20(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,32(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 36(%esp),%ecx + addl %ebp,%edx + # 40_59 57 + movl %esi,%ebp + xorl 44(%esp),%ecx + xorl %eax,%ebp + xorl 4(%esp),%ecx + andl %edi,%ebp + xorl 24(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,36(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 40(%esp),%ebx + addl %ebp,%ecx + # 40_59 58 + movl %edi,%ebp + xorl 48(%esp),%ebx + xorl %esi,%ebp + xorl 8(%esp),%ebx + andl %edx,%ebp + xorl 28(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,40(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 44(%esp),%eax + addl %ebp,%ebx + # 40_59 59 + movl %edx,%ebp + xorl 52(%esp),%eax + xorl %edi,%ebp + xorl 12(%esp),%eax + andl %ecx,%ebp + xorl 32(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,44(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 48(%esp),%esi + addl %ebp,%eax + # 20_39 60 + movl %ebx,%ebp + xorl 56(%esp),%esi + xorl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 36(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,48(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 52(%esp),%edi + addl %ebp,%esi + # 20_39 61 + movl %eax,%ebp + xorl 60(%esp),%edi + xorl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 40(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,52(%esp) + leal 
3395469782(%edi,%edx,1),%edi + movl 56(%esp),%edx + addl %ebp,%edi + # 20_39 62 + movl %esi,%ebp + xorl (%esp),%edx + xorl %eax,%ebp + xorl 24(%esp),%edx + xorl %ebx,%ebp + xorl 44(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,56(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 60(%esp),%ecx + addl %ebp,%edx + # 20_39 63 + movl %edi,%ebp + xorl 4(%esp),%ecx + xorl %esi,%ebp + xorl 28(%esp),%ecx + xorl %eax,%ebp + xorl 48(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,60(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl (%esp),%ebx + addl %ebp,%ecx + # 20_39 64 + movl %edx,%ebp + xorl 8(%esp),%ebx + xorl %edi,%ebp + xorl 32(%esp),%ebx + xorl %esi,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + # 20_39 65 + movl %ecx,%ebp + xorl 12(%esp),%eax + xorl %edx,%ebp + xorl 36(%esp),%eax + xorl %edi,%ebp + xorl 56(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,4(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl %ebp,%eax + # 20_39 66 + movl %ebx,%ebp + xorl 16(%esp),%esi + xorl %ecx,%ebp + xorl 40(%esp),%esi + xorl %edx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,8(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + # 20_39 67 + movl %eax,%ebp + xorl 20(%esp),%edi + xorl %ebx,%ebp + xorl 44(%esp),%edi + xorl %ecx,%ebp + xorl (%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,12(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + # 20_39 68 + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + # 20_39 69 + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + # 20_39 70 + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + # 20_39 71 + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + # 20_39 72 + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + # 20_39 73 + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + 
roll $5,%ebp + movl %edi,36(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + # 20_39 74 + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + # 20_39 75 + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + # 20_39 76 + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + # 20_39 77 + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + leal 3395469782(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + # 20_39 78 + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + leal 3395469782(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + # 20_39 79 + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + leal 3395469782(%edi,%edx,1),%edi + addl %ebp,%edi + movl 96(%esp),%ebp + movl 100(%esp),%edx + addl (%ebp),%edi + addl 4(%ebp),%esi + addl 8(%ebp),%eax + addl 12(%ebp),%ebx + addl 16(%ebp),%ecx + movl %edi,(%ebp) + addl $64,%edx + movl %esi,4(%ebp) + cmpl 104(%esp),%edx + movl %eax,8(%ebp) + movl %ecx,%edi + movl %ebx,12(%ebp) + movl %edx,%esi + movl %ecx,16(%ebp) + jb L000loop + addl $76,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha1_block_data_order_ssse3 +.private_extern _sha1_block_data_order_ssse3 +.align 4 +_sha1_block_data_order_ssse3: +L_sha1_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call L001pic_point +L001pic_point: + popl %ebp + leal LK_XX_XX-L001pic_point(%ebp),%ebp + movdqa (%ebp),%xmm7 + movdqa 16(%ebp),%xmm0 + movdqa 32(%ebp),%xmm1 + movdqa 48(%ebp),%xmm2 + movdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + movdqa %xmm0,112(%esp) + movdqa %xmm1,128(%esp) + movdqa %xmm2,144(%esp) + shll $6,%edx + movdqa %xmm7,160(%esp) + addl %ebp,%edx + movdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + movdqu -64(%ebp),%xmm0 + movdqu -48(%ebp),%xmm1 + movdqu -32(%ebp),%xmm2 + movdqu -16(%ebp),%xmm3 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + movdqa %xmm7,96(%esp) +.byte 102,15,56,0,222 + paddd %xmm7,%xmm0 + paddd %xmm7,%xmm1 + paddd %xmm7,%xmm2 + movdqa %xmm0,(%esp) + psubd %xmm7,%xmm0 + movdqa %xmm1,16(%esp) + psubd %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + movl %ecx,%ebp + psubd 
%xmm7,%xmm2 + xorl %edx,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebp,%esi + jmp L002loop +.align 4,0x90 +L002loop: + rorl $2,%ebx + xorl %edx,%esi + movl %eax,%ebp + punpcklqdq %xmm1,%xmm4 + movdqa %xmm3,%xmm6 + addl (%esp),%edi + xorl %ecx,%ebx + paddd %xmm3,%xmm7 + movdqa %xmm0,64(%esp) + roll $5,%eax + addl %esi,%edi + psrldq $4,%xmm6 + andl %ebx,%ebp + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%edi + rorl $7,%eax + pxor %xmm2,%xmm6 + xorl %ecx,%ebp + movl %edi,%esi + addl 4(%esp),%edx + pxor %xmm6,%xmm4 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm7,48(%esp) + addl %ebp,%edx + andl %eax,%esi + movdqa %xmm4,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + movdqa %xmm4,%xmm6 + xorl %ebx,%esi + pslldq $12,%xmm0 + paddd %xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + psrld $31,%xmm6 + xorl %eax,%edi + roll $5,%edx + movdqa %xmm0,%xmm7 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + psrld $30,%xmm0 + addl %edx,%ecx + rorl $7,%edx + por %xmm6,%xmm4 + xorl %eax,%ebp + movl %ecx,%esi + addl 12(%esp),%ebx + pslld $2,%xmm7 + xorl %edi,%edx + roll $5,%ecx + pxor %xmm0,%xmm4 + movdqa 96(%esp),%xmm0 + addl %ebp,%ebx + andl %edx,%esi + pxor %xmm7,%xmm4 + pshufd $238,%xmm1,%xmm5 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + xorl %edi,%esi + movl %ebx,%ebp + punpcklqdq %xmm2,%xmm5 + movdqa %xmm4,%xmm7 + addl 16(%esp),%eax + xorl %edx,%ecx + paddd %xmm4,%xmm0 + movdqa %xmm1,80(%esp) + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm7 + andl %ecx,%ebp + xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm7 + xorl %edx,%ebp + movl %eax,%esi + addl 20(%esp),%edi + pxor %xmm7,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm0,(%esp) + addl %ebp,%edi + andl %ebx,%esi + movdqa %xmm5,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + movdqa %xmm5,%xmm7 + xorl %ecx,%esi + pslldq $12,%xmm1 + paddd %xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + psrld $31,%xmm7 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm1,%xmm0 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + psrld $30,%xmm1 + addl %edi,%edx + rorl $7,%edi + por %xmm7,%xmm5 + xorl %ebx,%ebp + movl %edx,%esi + addl 28(%esp),%ecx + pslld $2,%xmm0 + xorl %eax,%edi + roll $5,%edx + pxor %xmm1,%xmm5 + movdqa 112(%esp),%xmm1 + addl %ebp,%ecx + andl %edi,%esi + pxor %xmm0,%xmm5 + pshufd $238,%xmm2,%xmm6 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + punpcklqdq %xmm3,%xmm6 + movdqa %xmm5,%xmm0 + addl 32(%esp),%ebx + xorl %edi,%edx + paddd %xmm5,%xmm1 + movdqa %xmm2,96(%esp) + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm0 + andl %edx,%ebp + xorl %edi,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm0 + xorl %edi,%ebp + movl %ebx,%esi + addl 36(%esp),%eax + pxor %xmm0,%xmm6 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm1,16(%esp) + addl %ebp,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm0 + xorl %edx,%esi + pslldq $12,%xmm2 + paddd %xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + psrld $31,%xmm0 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm2,%xmm1 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + psrld $30,%xmm2 + addl %eax,%edi + rorl $7,%eax + por %xmm0,%xmm6 + xorl %ecx,%ebp + movdqa 64(%esp),%xmm0 + movl %edi,%esi + addl 44(%esp),%edx + pslld $2,%xmm1 + xorl %ebx,%eax + roll $5,%edi + pxor %xmm2,%xmm6 + movdqa 112(%esp),%xmm2 + addl %ebp,%edx + andl %eax,%esi + pxor %xmm1,%xmm6 + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%esi 
+ movl %edx,%ebp + punpcklqdq %xmm4,%xmm7 + movdqa %xmm6,%xmm1 + addl 48(%esp),%ecx + xorl %eax,%edi + paddd %xmm6,%xmm2 + movdqa %xmm3,64(%esp) + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm1 + andl %edi,%ebp + xorl %eax,%edi + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm1 + xorl %eax,%ebp + movl %ecx,%esi + addl 52(%esp),%ebx + pxor %xmm1,%xmm7 + xorl %edi,%edx + roll $5,%ecx + movdqa %xmm2,32(%esp) + addl %ebp,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm1 + xorl %edi,%esi + pslldq $12,%xmm3 + paddd %xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + psrld $31,%xmm1 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm3,%xmm2 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + psrld $30,%xmm3 + addl %ebx,%eax + rorl $7,%ebx + por %xmm1,%xmm7 + xorl %edx,%ebp + movdqa 80(%esp),%xmm1 + movl %eax,%esi + addl 60(%esp),%edi + pslld $2,%xmm2 + xorl %ecx,%ebx + roll $5,%eax + pxor %xmm3,%xmm7 + movdqa 112(%esp),%xmm3 + addl %ebp,%edi + andl %ebx,%esi + pxor %xmm2,%xmm7 + pshufd $238,%xmm6,%xmm2 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,80(%esp) + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm3,%xmm4 + addl %esi,%edx + paddd %xmm7,%xmm3 + andl %eax,%ebp + pxor %xmm2,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + roll $5,%edx + pslld $2,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + psrld $30,%xmm2 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + xorl %edi,%edx + roll $5,%ecx + por %xmm2,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + movdqa 96(%esp),%xmm2 + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + pshufd $238,%xmm7,%xmm3 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 16(%esp),%edi + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm2,%xmm1 + movdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm4,%xmm5 + rorl $7,%ebx + paddd %xmm0,%xmm4 + addl %eax,%edi + pxor %xmm3,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm3 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm3,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + movdqa 64(%esp),%xmm3 + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + pshufd $238,%xmm0,%xmm4 + addl %ecx,%ebx + addl 32(%esp),%eax + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + pxor %xmm3,%xmm2 + movdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + movdqa 128(%esp),%xmm6 + rorl $7,%ecx + paddd %xmm1,%xmm5 + addl %ebx,%eax + pxor %xmm4,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + pslld $2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + psrld $30,%xmm4 + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + por %xmm4,%xmm2 
+ addl 44(%esp),%ecx + xorl %eax,%ebp + movdqa 80(%esp),%xmm4 + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + pshufd $238,%xmm1,%xmm5 + addl %edx,%ecx + addl 48(%esp),%ebx + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + pxor %xmm4,%xmm3 + movdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%edx + paddd %xmm2,%xmm6 + addl %ecx,%ebx + pxor %xmm5,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pslld $2,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + psrld $30,%xmm5 + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + por %xmm5,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + movdqa 96(%esp),%xmm5 + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + pshufd $238,%xmm2,%xmm6 + addl %edi,%edx + addl (%esp),%ecx + pxor %xmm0,%xmm4 + punpcklqdq %xmm3,%xmm6 + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + pxor %xmm5,%xmm4 + movdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + movdqa %xmm7,%xmm0 + rorl $7,%edi + paddd %xmm3,%xmm7 + addl %edx,%ecx + pxor %xmm6,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm6 + movdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + pslld $2,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + psrld $30,%xmm6 + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + por %xmm6,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + movdqa 64(%esp),%xmm6 + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + pshufd $238,%xmm3,%xmm7 + addl %eax,%edi + addl 16(%esp),%edx + pxor %xmm1,%xmm5 + punpcklqdq %xmm4,%xmm7 + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + pxor %xmm6,%xmm5 + movdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + movdqa %xmm0,%xmm1 + rorl $7,%eax + paddd %xmm4,%xmm0 + addl %edi,%edx + pxor %xmm7,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm7 + movdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + pslld $2,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + psrld $30,%xmm7 + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + por %xmm7,%xmm5 + addl 28(%esp),%eax + movdqa 80(%esp),%xmm7 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pshufd $238,%xmm4,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 32(%esp),%edi + pxor %xmm2,%xmm6 + punpcklqdq %xmm5,%xmm0 + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + pxor %xmm7,%xmm6 + movdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + roll $5,%eax + movdqa %xmm1,%xmm2 + addl %esi,%edi + paddd %xmm5,%xmm1 + xorl %ebx,%ebp + pxor %xmm0,%xmm6 + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + andl %ebx,%ebp + movdqa %xmm6,%xmm0 + movdqa %xmm1,16(%esp) + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + pslld $2,%xmm6 + addl %ebp,%edx + xorl %eax,%esi + psrld $30,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + por %xmm0,%xmm6 + movl %edx,%ebp + xorl %eax,%esi + movdqa 96(%esp),%xmm0 + roll $5,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + pshufd 
$238,%xmm5,%xmm1 + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 48(%esp),%eax + pxor %xmm3,%xmm7 + punpcklqdq %xmm6,%xmm1 + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + pxor %xmm0,%xmm7 + movdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + roll $5,%ebx + movdqa 144(%esp),%xmm3 + addl %esi,%eax + paddd %xmm6,%xmm2 + xorl %ecx,%ebp + pxor %xmm1,%xmm7 + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + andl %ecx,%ebp + movdqa %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + pslld $2,%xmm7 + addl %ebp,%edi + xorl %ebx,%esi + psrld $30,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + por %xmm1,%xmm7 + movl %edi,%ebp + xorl %ebx,%esi + movdqa 64(%esp),%xmm1 + roll $5,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + pshufd $238,%xmm6,%xmm2 + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl (%esp),%ebx + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + roll $5,%ecx + movdqa %xmm3,%xmm4 + addl %esi,%ebx + paddd %xmm7,%xmm3 + xorl %edx,%ebp + pxor %xmm2,%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + andl %edx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pslld $2,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + psrld $30,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + por %xmm2,%xmm0 + movl %eax,%ebp + xorl %ecx,%esi + movdqa 80(%esp),%xmm2 + roll $5,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + pshufd $238,%xmm7,%xmm3 + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 16(%esp),%ecx + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + pxor %xmm2,%xmm1 + movdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + roll $5,%edx + movdqa %xmm4,%xmm5 + addl %esi,%ecx + paddd %xmm0,%xmm4 + xorl %edi,%ebp + pxor %xmm3,%xmm1 + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + andl %edi,%ebp + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + pslld $2,%xmm1 + addl %ebp,%ebx + xorl %edx,%esi + psrld $30,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + por %xmm3,%xmm1 + movl %ebx,%ebp + xorl %edx,%esi + movdqa 96(%esp),%xmm3 + roll $5,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + pshufd $238,%xmm0,%xmm4 + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 32(%esp),%edx + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + pxor %xmm3,%xmm2 + movdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + roll $5,%edi + movdqa 
%xmm5,%xmm6 + addl %esi,%edx + paddd %xmm1,%xmm5 + xorl %eax,%ebp + pxor %xmm4,%xmm2 + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + andl %eax,%ebp + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + pslld $2,%xmm2 + addl %ebp,%ecx + xorl %edi,%esi + psrld $30,%xmm4 + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + por %xmm4,%xmm2 + movl %ecx,%ebp + xorl %edi,%esi + movdqa 64(%esp),%xmm4 + roll $5,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + pshufd $238,%xmm1,%xmm5 + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + addl 48(%esp),%edi + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm4,%xmm3 + movdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%ebx + paddd %xmm2,%xmm6 + addl %eax,%edi + pxor %xmm5,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm5 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm5,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl (%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + paddd %xmm3,%xmm7 + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + movdqa %xmm7,48(%esp) + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je L003done + movdqa 160(%esp),%xmm7 + movdqa 176(%esp),%xmm6 + movdqu (%ebp),%xmm0 + movdqu 16(%ebp),%xmm1 + movdqu 32(%ebp),%xmm2 + movdqu 48(%ebp),%xmm3 + addl $64,%ebp +.byte 102,15,56,0,198 + movl %ebp,196(%esp) + movdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx +.byte 102,15,56,0,206 + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + paddd %xmm7,%xmm0 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + movdqa %xmm0,(%esp) + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + psubd %xmm7,%xmm0 + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi +.byte 102,15,56,0,214 + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + paddd %xmm7,%xmm1 + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + movdqa %xmm1,16(%esp) + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + psubd %xmm7,%xmm1 + roll $5,%ebx + addl %esi,%eax + xorl 
%edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax +.byte 102,15,56,0,222 + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + paddd %xmm7,%xmm2 + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + movdqa %xmm2,32(%esp) + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + psubd %xmm7,%xmm2 + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %ecx,%ebx + movl %edx,12(%ebp) + xorl %edx,%ebx + movl %edi,16(%ebp) + movl %esi,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebx,%esi + movl %ebp,%ebx + jmp L002loop +.align 4,0x90 +L003done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha1_block_data_order_avx +.private_extern _sha1_block_data_order_avx +.align 4 +_sha1_block_data_order_avx: +L_sha1_block_data_order_avx_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call L004pic_point +L004pic_point: + popl %ebp + leal LK_XX_XX-L004pic_point(%ebp),%ebp + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp 
+ movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp L005loop +.align 4,0x90 +L005loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor 
%xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld 
$30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + 
movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + 
xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl 
%ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je L006done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp L005loop +.align 4,0x90 +L006done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 
28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK_XX_XX: +.long 1518500249,1518500249,1518500249,1518500249 +.long 1859775393,1859775393,1859775393,1859775393 +.long 2400959708,2400959708,2400959708,2400959708 +.long 3395469782,3395469782,3395469782,3395469782 +.long 66051,67438087,134810123,202182159 +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 +.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 +.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-586-linux.S index 11b6cb2f..0778a6fc 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
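The LK_XX_XX table emitted at the end of the x86 SHA-1 assembly above holds the four standard SHA-1 round constants (1518500249 = 0x5a827999, 1859775393 = 0x6ed9eba1, 2400959708 = 0x8f1bbcdc, 3395469782 = 0xca62c1d6), each repeated across a 128-bit lane for the SSSE3/AVX message-schedule paths, followed by the byte-swap mask and the CRYPTOGAMS attribution string. The unrolled "20_39" and "40_59" round groups above are the usual SHA-1 compression rounds: the three xorl/roll $1 sequence is the message schedule, and the leal K(%W,%e,1) form folds the round constant, the expanded word, and e into one add. As a reading aid only, here is a minimal portable C sketch of that logic; rotl32, sha1_f, sha1_w, and sha1_round are hypothetical helper names for this example and are not part of the vendored BoringSSL sources.

#include <stdint.h>

/* Rotate left; n is expected to be 1..31 in these rounds (1, 5, 30). */
static uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* Round function per group, matching the constants in the assembly:
 *   rounds  0..19  Ch(b,c,d)        K = 0x5a827999
 *   rounds 20..39  b ^ c ^ d        K = 0x6ed9eba1
 *   rounds 40..59  Maj(b,c,d)       K = 0x8f1bbcdc
 *   rounds 60..79  b ^ c ^ d        K = 0xca62c1d6                      */
static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d) {
    if (t < 20) return (b & c) | (~b & d);
    if (t < 40) return b ^ c ^ d;
    if (t < 60) return (b & c) | (b & d) | (c & d);
    return b ^ c ^ d;
}

/* Message schedule for t >= 16, the "xorl, xorl, xorl, roll $1" pattern:
 * W[t] = rotl(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1), kept in a 16-word
 * circular buffer exactly as the 64-byte stack window is used above.    */
static uint32_t sha1_w(uint32_t w[16], int t) {
    uint32_t x = w[(t - 3) & 15] ^ w[(t - 8) & 15] ^ w[(t - 14) & 15] ^ w[t & 15];
    return w[t & 15] = rotl32(x, 1);
}

/* One round: e += rotl(a,5) + F(b,c,d) + K + W[t]; b = rotl(b,30); then
 * the five working variables shift down by one position.                */
static void sha1_round(uint32_t s[5], uint32_t k, uint32_t f, uint32_t wt) {
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
    e += rotl32(a, 5) + f + k + wt;
    b = rotl32(b, 30);
    s[0] = e; s[1] = a; s[2] = b; s[3] = c; s[4] = d;
}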
@@ -3788,7 +3787,6 @@ sha1_block_data_order_avx: .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv4-large-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-armv4-large-linux.S index f9bb4730..aea08091 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv4-large-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1481,7 +1480,6 @@ sha1_block_data_order_hw: .size sha1_block_data_order_hw,.-sha1_block_data_order_hw #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-apple.S index 21c39904..ea3e5513 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1218,7 +1217,6 @@ Lconst: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-linux.S index 296b18c6..4afb8cfd 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
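The renames in these hunks all follow the same pattern: each per-platform copy under crypto/fipsmodule/ (for example sha1-586-linux.linux.x86.S or sha1-armv8-ios.ios.aarch64.S) becomes a single file under gen/bcm/, and the hand-added outer wrapper of the form #if defined(__i386__) && defined(__linux__) ... #endif is dropped. What remains are the guards the generator itself emits, keyed on OPENSSL_NO_ASM, the target macro (OPENSSL_X86, OPENSSL_ARM, OPENSSL_AARCH64), and the ABI macro (__ELF__, __APPLE__, or _WIN32), so one generated copy can be fed to every compiler and the preprocessor strips it where it does not apply. A rough sketch of the remaining layout, with the include names and assembly bodies elided because they are not shown in these hunks, is:

/* Sketch only: guard layout of a regenerated gen/bcm/*.S file after the
 * outer per-platform #if/#endif wrapper is removed.                      */
#define BORINGSSL_PREFIX CCryptoBoringSSL
/* (prefix-symbols include, name elided in this sketch) */

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
/* AArch64 Windows SHA-1 assembly body */
#endif  /* !OPENSSL_NO_ASM && OPENSSL_AARCH64 && _WIN32 */

#if defined(__linux__) && defined(__ELF__)
/* .section .note.GNU-stack,"",%progbits  (non-executable stack note) */
#endif

The new sha1-armv8-win.S below follows the same layout: its body is guarded by OPENSSL_AARCH64 && _WIN32 and compiles to nothing on every other target.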
@@ -1218,7 +1217,6 @@ sha1_block_data_order_hw: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-win.S new file mode 100644 index 00000000..5635fdee --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-win.S @@ -0,0 +1,1227 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.text + +.globl sha1_block_data_order_nohw + +.def sha1_block_data_order_nohw + .type 32 +.endef +.align 6 +sha1_block_data_order_nohw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp w20,w21,[x0] + ldp w22,w23,[x0,#8] + ldr w24,[x0,#16] + +Loop: + ldr x3,[x1],#64 + movz w28,#0x7999 + sub x2,x2,#1 + movk w28,#0x5a82,lsl#16 +#ifdef __AARCH64EB__ + ror x3,x3,#32 +#else + rev32 x3,x3 +#endif + add w24,w24,w28 // warm it up + add w24,w24,w3 + lsr x4,x3,#32 + ldr x5,[x1,#-56] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w4 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x5,x5,#32 +#else + rev32 x5,x5 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w5 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x6,x5,#32 + ldr x7,[x1,#-48] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w6 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x7,x7,#32 +#else + rev32 x7,x7 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w7 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x8,x7,#32 + ldr x9,[x1,#-40] + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w8 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x9,x9,#32 +#else + rev32 x9,x9 +#endif + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w9 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + lsr x10,x9,#32 + ldr x11,[x1,#-32] + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w10 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x11,x11,#32 +#else + rev32 x11,x11 +#endif + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add 
w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w11 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + lsr x12,x11,#32 + ldr x13,[x1,#-24] + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w12 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x13,x13,#32 +#else + rev32 x13,x13 +#endif + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w13 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + lsr x14,x13,#32 + ldr x15,[x1,#-16] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w14 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x15,x15,#32 +#else + rev32 x15,x15 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w15 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x16,x15,#32 + ldr x17,[x1,#-8] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w16 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x17,x17,#32 +#else + rev32 x17,x17 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w17 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x19,x17,#32 + eor w3,w3,w5 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w3,w3,w11 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w3,w3,w16 + ror w22,w22,#2 + add w24,w24,w19 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + eor w4,w4,w12 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + eor w4,w4,w17 + ror w21,w21,#2 + add w23,w23,w3 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + eor w5,w5,w13 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + eor w5,w5,w19 + ror w20,w20,#2 + add w22,w22,w4 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + eor w6,w6,w14 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + eor w6,w6,w3 + ror w24,w24,#2 + add w21,w21,w5 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + eor w7,w7,w15 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + eor w7,w7,w4 + ror w23,w23,#2 + add w20,w20,w6 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w7,w7,#31 + movz w28,#0xeba1 + movk w28,#0x6ed9,lsl#16 + eor w8,w8,w10 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w8,w8,w16 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w8,w8,w5 + ror w22,w22,#2 + add w24,w24,w7 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror 
w8,w8,#31 + eor w9,w9,w11 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w9,w9,w6 + add w23,w23,w8 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w10,w10,w7 + add w22,w22,w9 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w11,w11,w8 + add w21,w21,w10 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w12,w12,w9 + add w20,w20,w11 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w13,w13,w10 + add w24,w24,w12 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w14,w14,w11 + add w23,w23,w13 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w15,w15,w12 + add w22,w22,w14 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w16,w16,w13 + add w21,w21,w15 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w17,w17,w14 + add w20,w20,w16 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w19,w19,w15 + add w24,w24,w17 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w3,w3,w16 + add w23,w23,w19 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w4,w4,w17 + add w22,w22,w3 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w5,w5,w19 + add w21,w21,w4 // future e+=X[i] + add w22,w22,w25 // 
e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w6,w6,w3 + add w20,w20,w5 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w7,w7,w4 + add w24,w24,w6 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w8,w8,w5 + add w23,w23,w7 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w9,w9,w6 + add w22,w22,w8 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w10,w10,w7 + add w21,w21,w9 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w11,w11,w8 + add w20,w20,w10 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w11,w11,#31 + movz w28,#0xbcdc + movk w28,#0x8f1b,lsl#16 + eor w12,w12,w14 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w12,w12,w9 + add w24,w24,w11 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w13,w13,w15 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w13,w13,w5 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w13,w13,w10 + add w23,w23,w12 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w14,w14,w16 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w14,w14,w6 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w14,w14,w11 + add w22,w22,w13 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w15,w15,w17 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w15,w15,w7 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w15,w15,w12 + add w21,w21,w14 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w15,w15,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w16,w16,w19 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w16,w16,w8 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w16,w16,w13 + add w20,w20,w15 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w16,w16,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w17,w17,w3 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w17,w17,w9 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w17,w17,w14 + add w24,w24,w16 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w17,w17,#31 + orr w25,w21,w22 + 
and w26,w21,w22 + eor w19,w19,w4 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w19,w19,w10 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w19,w19,w15 + add w23,w23,w17 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w19,w19,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w3,w3,w5 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w3,w3,w11 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w3,w3,w16 + add w22,w22,w19 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w3,w3,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w4,w4,w6 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w4,w4,w12 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w4,w4,w17 + add w21,w21,w3 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w4,w4,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w5,w5,w7 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w5,w5,w13 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w5,w5,w19 + add w20,w20,w4 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w5,w5,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w6,w6,w8 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w6,w6,w14 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w6,w6,w3 + add w24,w24,w5 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w6,w6,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w7,w7,w9 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w7,w7,w15 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w7,w7,w4 + add w23,w23,w6 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w7,w7,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w8,w8,w10 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w8,w8,w16 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w8,w8,w5 + add w22,w22,w7 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w8,w8,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w9,w9,w11 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w9,w9,w17 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w9,w9,w6 + add w21,w21,w8 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w9,w9,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w10,w10,w12 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w10,w10,w19 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w10,w10,w7 + add w20,w20,w9 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w10,w10,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w11,w11,w13 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w11,w11,w3 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w11,w11,w8 + add w24,w24,w10 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w11,w11,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w12,w12,w14 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w12,w12,w4 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w12,w12,w9 + add w23,w23,w11 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w13,w13,w15 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w13,w13,w5 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror 
w20,w20,#2 + eor w13,w13,w10 + add w22,w22,w12 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w14,w14,w16 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w14,w14,w6 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w14,w14,w11 + add w21,w21,w13 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w15,w15,w17 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w15,w15,w7 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w15,w15,w12 + add w20,w20,w14 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w15,w15,#31 + movz w28,#0xc1d6 + movk w28,#0xca62,lsl#16 + orr w25,w22,w23 + and w26,w22,w23 + eor w16,w16,w19 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w16,w16,w8 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w16,w16,w13 + add w24,w24,w15 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w17,w17,w14 + add w23,w23,w16 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w19,w19,w15 + add w22,w22,w17 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w3,w3,w16 + add w21,w21,w19 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w4,w4,w17 + add w20,w20,w3 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w5,w5,w19 + add w24,w24,w4 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w6,w6,w3 + add w23,w23,w5 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w7,w7,w4 + add w22,w22,w6 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w8,w8,w5 + add w21,w21,w7 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w9,w9,w6 + add w20,w20,w8 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor 
w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w10,w10,w7 + add w24,w24,w9 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w11,w11,w8 + add w23,w23,w10 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w12,w12,w9 + add w22,w22,w11 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w13,w13,w10 + add w21,w21,w12 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w14,w14,w11 + add w20,w20,w13 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w15,w15,w12 + add w24,w24,w14 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w16,w16,w13 + add w23,w23,w15 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w17,w17,w14 + add w22,w22,w16 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w19,w19,w15 + add w21,w21,w17 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w19,w19,#31 + ldp w4,w5,[x0] + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w19 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ldp w6,w7,[x0,#8] + eor w25,w24,w22 + ror w27,w21,#27 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + ldr w8,[x0,#16] + add w20,w20,w25 // e+=F(b,c,d) + add w21,w21,w5 + add w22,w22,w6 + add w20,w20,w4 + add w23,w23,w7 + add w24,w24,w8 + stp w20,w21,[x0] + stp w22,w23,[x0,#8] + str w24,[x0,#16] + cbnz x2,Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret + +.globl sha1_block_data_order_hw + +.def sha1_block_data_order_hw + .type 32 +.endef +.align 6 +sha1_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + adrp x4,Lconst + add x4,x4,:lo12:Lconst + eor v1.16b,v1.16b,v1.16b + ld1 {v0.4s},[x0],#16 + ld1 {v1.s}[0],[x0] + sub x0,x0,#16 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + + add v20.4s,v16.4s,v4.4s + rev32 v6.16b,v6.16b + orr v22.16b,v0.16b,v0.16b // offload + + add v21.4s,v16.4s,v5.4s + rev32 v7.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b +.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 + add v20.4s,v16.4s,v6.4s +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 1 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v16.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 2 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v16.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 3 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 4 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 5 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 6 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 7 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 8 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 9 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 10 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 11 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 12 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 13 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 14 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 
0x5e280802 //sha1h v2.16b,v0.16b // 15 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 16 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 17 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + +.long 0x5e280803 //sha1h v3.16b,v0.16b // 18 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + +.long 0x5e280802 //sha1h v2.16b,v0.16b // 19 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + + add v1.4s,v1.4s,v2.4s + add v0.4s,v0.4s,v22.4s + + cbnz x2,Loop_hw + + st1 {v0.4s},[x0],#16 + st1 {v1.s}[0],[x0] + + ldr x29,[sp],#16 + ret + +.section .rodata +.align 6 +Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-apple.S index 7ae9b669..a3391770 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -5450,7 +5449,6 @@ K_XX_XX: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-linux.S index 912b74c5..fc01d95f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -5450,7 +5449,6 @@ K_XX_XX: .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-apple.S new file mode 100644 index 00000000..9447fcd3 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-apple.S @@ -0,0 +1,5598 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw +.align 4 +_sha256_block_data_order_nohw: +L_sha256_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L000pic_point +L000pic_point: + popl %ebp + leal LK256-L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) +L001no_xmm: + subl %edi,%eax + cmpl $256,%eax + jae L002unrolled + jmp L003loop +.align 4,0x90 +L003loop: + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + bswap %eax + movl 12(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 16(%edi),%eax + movl 20(%edi),%ebx + movl 24(%edi),%ecx + bswap %eax + movl 28(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 32(%edi),%eax + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %eax + movl 44(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 48(%edi),%eax + movl 52(%edi),%ebx + movl 56(%edi),%ecx + bswap %eax + movl 60(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + addl $64,%edi + leal -36(%esp),%esp + movl %edi,104(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) +.align 4,0x90 +L00400_15: + movl %edx,%ecx + movl 24(%esp),%esi + rorl $14,%ecx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl 96(%esp),%ebx + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + addl $4,%ebp + addl %ebx,%eax + cmpl $3248222580,%esi + jne L00400_15 + movl 156(%esp),%ecx + jmp L00516_63 +.align 4,0x90 +L00516_63: + movl %ecx,%ebx + movl 104(%esp),%esi + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 24(%esp),%esi + 
rorl $14,%ecx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl %ebx,96(%esp) + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx + addl $4,%ebp + addl %ebx,%eax + cmpl $3329325298,%esi + jne L00516_63 + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi + addl 16(%esi),%edx + addl 20(%esi),%eax + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %eax,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + leal 356(%esp),%esp + subl $256,%ebp + cmpl 8(%esp),%edi + jb L003loop + movl 12(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK256: +.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +.long 66051,67438087,134810123,202182159 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +.align 4,0x90 +L002unrolled: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebp + movl 8(%esi),%ecx + movl 12(%esi),%ebx + movl %ebp,4(%esp) + xorl %ecx,%ebp + movl %ecx,8(%esp) + movl %ebx,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %esi,28(%esp) + jmp L006grand_loop +.align 4,0x90 +L006grand_loop: + movl (%edi),%ebx + movl 4(%edi),%ecx + bswap %ebx + movl 8(%edi),%esi + bswap %ecx + movl %ebx,32(%esp) + bswap %esi + movl %ecx,36(%esp) + movl %esi,40(%esp) + movl 12(%edi),%ebx + movl 16(%edi),%ecx + bswap %ebx + movl 20(%edi),%esi + bswap %ecx + movl %ebx,44(%esp) + bswap %esi + movl %ecx,48(%esp) + movl %esi,52(%esp) + movl 24(%edi),%ebx + movl 28(%edi),%ecx + bswap %ebx + movl 32(%edi),%esi + bswap %ecx + movl %ebx,56(%esp) + bswap %esi + movl %ecx,60(%esp) + movl %esi,64(%esp) + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %ebx + movl 44(%edi),%esi + bswap %ecx + movl %ebx,68(%esp) + bswap %esi + movl %ecx,72(%esp) + movl %esi,76(%esp) + movl 48(%edi),%ebx + movl 52(%edi),%ecx + bswap %ebx + movl 56(%edi),%esi + bswap %ecx + movl %ebx,80(%esp) + bswap %esi + movl %ecx,84(%esp) + movl 
%esi,88(%esp) + movl 60(%edi),%ebx + addl $64,%edi + bswap %ebx + movl %edi,100(%esp) + movl %ebx,92(%esp) + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 32(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1116352408(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 36(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1899447441(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 40(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3049323471(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 44(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3921009573(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 48(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 961987163(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 52(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1508970993(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl 
(%esp),%edi + xorl %ecx,%edx + movl 56(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2453635748(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 60(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2870763221(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 64(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3624381080(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 68(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 310598401(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 72(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 607225278(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 76(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1426881987(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 80(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl 
%esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1925078388(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 84(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2162078206(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 88(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2614888103(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 92(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3248222580(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3835390401(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl 
%ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 4022224774(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 264347078(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 604807628(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 770255983(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1249150122(%ebx,%edx,1),%edx + xorl %ecx,%esi 
+ xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1555081692(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1996064986(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2554220882(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2821834349(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + 
movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2952996808(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3210313671(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3336571891(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3584528711(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl 
$3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 113926993(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 338241895(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 666307205(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 773529912(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 
76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1294757372(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1396182291(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1695183700(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1986661051(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 
(%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2177026350(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2456956037(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2730485921(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2820302411(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl 
%ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3259730800(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3345764771(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3516065817(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3600352804(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl 
%edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 4094571909(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 275423344(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 430227734(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 506948616(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl 
%edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 659060556(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 883997877(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 958139571(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1322822218(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1537002063(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + 
movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1747873779(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1955562222(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2024104815(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2227730452(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx 
+ movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2361852424(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2428436474(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2756734187(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3204031479(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl 
%esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3329325298(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 96(%esp),%esi + xorl %edi,%ebp + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebp + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebp,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebp,4(%esp) + xorl %edi,%ebp + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ebx + movl 28(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + cmpl 104(%esp),%edi + jb L006grand_loop + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha256_block_data_order_ssse3 +.private_extern _sha256_block_data_order_ssse3 +.align 4 +_sha256_block_data_order_ssse3: +L_sha256_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L007pic_point +L007pic_point: + popl %ebp + leal LK256-L007pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + movdqa 256(%ebp),%xmm7 + jmp L008grand_ssse3 +.align 4,0x90 +L008grand_ssse3: + movdqu (%edi),%xmm0 + movdqu 16(%edi),%xmm1 + movdqu 32(%edi),%xmm2 + movdqu 48(%edi),%xmm3 + addl $64,%edi +.byte 102,15,56,0,199 + movl %edi,100(%esp) +.byte 102,15,56,0,207 + movdqa (%ebp),%xmm4 +.byte 102,15,56,0,215 + movdqa 16(%ebp),%xmm5 + paddd %xmm0,%xmm4 +.byte 102,15,56,0,223 + movdqa 32(%ebp),%xmm6 + paddd %xmm1,%xmm5 + movdqa 48(%ebp),%xmm7 + movdqa %xmm4,32(%esp) + paddd %xmm2,%xmm6 + movdqa %xmm5,48(%esp) + paddd %xmm3,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + jmp L009ssse3_00_47 +.align 4,0x90 +L009ssse3_00_47: + addl $64,%ebp + movl %edx,%ecx + movdqa %xmm1,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi +.byte 102,15,58,15,224,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,250,4 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm3,%xmm7 + xorl 
%esi,%ecx + addl 32(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm0 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm0 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm0,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa (%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm0,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,32(%esp) + movl %edx,%ecx + movdqa %xmm2,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi +.byte 102,15,58,15,225,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,251,4 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm0,%xmm7 + xorl %esi,%ecx + addl 48(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm1 + movl %ebx,12(%esp) + xorl 
%ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm1 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm1,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 16(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm1,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,48(%esp) + movl %edx,%ecx + movdqa %xmm3,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi +.byte 102,15,58,15,226,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,248,4 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm1,%xmm7 + xorl %esi,%ecx + addl 64(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm2 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm2 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl 
$9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm2,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 32(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm2,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,64(%esp) + movl %edx,%ecx + movdqa %xmm0,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi +.byte 102,15,58,15,227,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,249,4 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm2,%xmm7 + xorl %esi,%ecx + addl 80(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm3 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm3 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm3,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 48(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl 
%esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm3,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne L009ssse3_00_47 + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + 
rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) 
+ xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + movdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb L008grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha256_block_data_order_avx +.private_extern _sha256_block_data_order_avx +.align 4 +_sha256_block_data_order_avx: +L_sha256_block_data_order_avx_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L010pic_point +L010pic_point: + popl %ebp + leal LK256-L010pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp L011grand_avx +.align 5,0x90 +L011grand_avx: + vmovdqu 
(%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp L012avx_00_47 +.align 4,0x90 +L012avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl 
$2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl 
$6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor 
%xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne L012avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + 
shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl 
%ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl 
%eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb L011grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-586-linux.S index 79af683c..4bc31d1d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -5599,7 +5598,6 @@ sha256_block_data_order_avx: ret .size sha256_block_data_order_avx,.-.L_sha256_block_data_order_avx_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv4-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-armv4-linux.S index 8e9f5522..0b3ba432 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv4-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -2839,7 +2838,6 @@ sha256_block_data_order_hw: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-apple.S index 341f867e..c7a89712 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1193,7 +1192,6 @@ Loop_hw: #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-linux.S index a2b88efb..99d3a3f7 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1193,7 +1192,6 @@ sha256_block_data_order_hw: .size sha256_block_data_order_hw,.-sha256_block_data_order_hw #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-win.S new file mode 100644 index 00000000..beb18820 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-win.S @@ -0,0 +1,1202 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. 
+// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +# include +#endif + +.text + +.globl sha256_block_data_order_nohw + +.def sha256_block_data_order_nohw + .type 32 +.endef +.align 6 +sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256 + add x30,x30,:lo12:LK256 + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add 
w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // 
(b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // 
d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic 
w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor 
w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) 
+ eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor 
w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 
83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw + +.def sha256_block_data_order_hw + .type 32 +.endef +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256 + add x3,x3,:lo12:LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 
//sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-apple.S index 8bb9c47d..d67f925c 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -4170,7 +4169,6 @@ L$epilogue_avx: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-linux.S index 749b6bea..1023d3c4 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -4170,7 +4169,6 @@ _CET_ENDBR .cfi_endproc .size sha256_block_data_order_avx,.-sha256_block_data_order_avx #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-apple.S new file mode 100644 index 00000000..b7e6b86b --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-apple.S @@ -0,0 +1,2411 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw +.align 4 +_sha512_block_data_order_nohw: +L_sha512_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L000pic_point +L000pic_point: + popl %ebp + leal LK512-L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + subl $80,%esp + jmp L001loop_sse2 +.align 4,0x90 +L001loop_sse2: + movq %mm1,8(%esp) + movq %mm2,16(%esp) + movq %mm3,24(%esp) + movq %mm5,40(%esp) + movq %mm6,48(%esp) + pxor %mm1,%mm2 + movq %mm7,56(%esp) + movq %mm0,%mm3 + movl (%edi),%eax + movl 4(%edi),%ebx + addl $8,%edi + movl $15,%edx + bswap %eax + bswap %ebx + jmp L00200_14_sse2 +.align 4,0x90 +L00200_14_sse2: + movd %eax,%mm1 + movl (%edi),%eax + movd %ebx,%mm7 + movl 4(%edi),%ebx + addl $8,%edi + bswap %eax + bswap %ebx + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + movq 48(%esp),%mm6 + decl %edx + jnz L00200_14_sse2 + movd %eax,%mm1 + movd %ebx,%mm7 + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor 
%mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + pxor %mm0,%mm0 + movl $32,%edx + jmp L00316_79_sse2 +.align 4,0x90 +L00316_79_sse2: + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm0 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm2 + addl $8,%ebp + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm2 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm0 + addl $8,%ebp + decl %edx + jnz L00316_79_sse2 + paddq %mm3,%mm0 + movq 8(%esp),%mm1 + movq 24(%esp),%mm3 + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq (%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movl $640,%eax + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq 
%mm7,56(%esi) + leal (%esp,%eax,1),%esp + subl %eax,%ebp + cmpl 88(%esp),%edi + jb L001loop_sse2 + movl 92(%esp),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha512_block_data_order_ssse3 +.private_extern _sha512_block_data_order_ssse3 +.align 4 +_sha512_block_data_order_ssse3: +L_sha512_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L004pic_point +L004pic_point: + popl %ebp + leal LK512-L004pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + leal -64(%esp),%edx + subl $256,%esp + movdqa 640(%ebp),%xmm1 + movdqu (%edi),%xmm0 +.byte 102,15,56,0,193 + movdqa (%ebp),%xmm3 + movdqa %xmm1,%xmm2 + movdqu 16(%edi),%xmm1 + paddq %xmm0,%xmm3 +.byte 102,15,56,0,202 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%edi),%xmm2 + paddq %xmm1,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 48(%edi),%xmm3 + paddq %xmm2,%xmm5 +.byte 102,15,56,0,220 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%edi),%xmm4 + paddq %xmm3,%xmm6 +.byte 102,15,56,0,229 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%edi),%xmm5 + paddq %xmm4,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%edi),%xmm6 + paddq %xmm5,%xmm0 +.byte 102,15,56,0,247 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 112(%edi),%xmm7 + paddq %xmm6,%xmm1 +.byte 102,15,56,0,248 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movdqa %xmm2,-16(%edx) + nop +.align 5,0x90 +L005loop_ssse3: + movdqa 16(%edx),%xmm2 + movdqa %xmm3,48(%edx) + leal 128(%ebp),%ebp + movq %mm1,8(%esp) + movl %edi,%ebx + movq %mm2,16(%esp) + leal 128(%edi),%edi + movq %mm3,24(%esp) + cmpl %eax,%edi + movq %mm5,40(%esp) + cmovbl %edi,%ebx + movq %mm6,48(%esp) + movl $4,%ecx + pxor %mm1,%mm2 + movq %mm7,56(%esp) + pxor %mm3,%mm3 + jmp L00600_47_ssse3 +.align 5,0x90 +L00600_47_ssse3: + movdqa %xmm5,%xmm3 + movdqa %xmm2,%xmm1 +.byte 102,15,58,15,208,8 + movdqa %xmm4,(%edx) +.byte 102,15,58,15,220,8 + movdqa %xmm2,%xmm4 + psrlq $7,%xmm2 + paddq %xmm3,%xmm0 + movdqa %xmm4,%xmm3 + psrlq $1,%xmm4 + psllq $56,%xmm3 + pxor %xmm4,%xmm2 + psrlq $7,%xmm4 + pxor %xmm3,%xmm2 + psllq $7,%xmm3 + pxor %xmm4,%xmm2 + movdqa %xmm7,%xmm4 + pxor %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + psrlq $6,%xmm4 + paddq %xmm2,%xmm0 + movdqa %xmm7,%xmm2 + psrlq $19,%xmm3 + psllq $3,%xmm2 + pxor %xmm3,%xmm4 + psrlq $42,%xmm3 + pxor %xmm2,%xmm4 + psllq $42,%xmm2 + pxor %xmm3,%xmm4 + movdqa 32(%edx),%xmm3 + pxor %xmm2,%xmm4 + movdqa (%ebp),%xmm2 + movq %mm4,%mm1 + paddq %xmm4,%xmm0 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm0,%xmm2 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + 
psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm2,-128(%edx) + movdqa %xmm6,%xmm4 + movdqa %xmm3,%xmm2 +.byte 102,15,58,15,217,8 + movdqa %xmm5,16(%edx) +.byte 102,15,58,15,229,8 + movdqa %xmm3,%xmm5 + psrlq $7,%xmm3 + paddq %xmm4,%xmm1 + movdqa %xmm5,%xmm4 + psrlq $1,%xmm5 + psllq $56,%xmm4 + pxor %xmm5,%xmm3 + psrlq $7,%xmm5 + pxor %xmm4,%xmm3 + psllq $7,%xmm4 + pxor %xmm5,%xmm3 + movdqa %xmm0,%xmm5 + pxor %xmm4,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $6,%xmm5 + paddq %xmm3,%xmm1 + movdqa %xmm0,%xmm3 + psrlq $19,%xmm4 + psllq $3,%xmm3 + pxor %xmm4,%xmm5 + psrlq $42,%xmm4 + pxor %xmm3,%xmm5 + psllq $42,%xmm3 + pxor %xmm4,%xmm5 + movdqa 48(%edx),%xmm4 + pxor %xmm3,%xmm5 + movdqa 16(%ebp),%xmm3 + movq %mm4,%mm1 + paddq %xmm5,%xmm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm1,%xmm3 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 
8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm3,-112(%edx) + movdqa %xmm7,%xmm5 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,226,8 + movdqa %xmm6,32(%edx) +.byte 102,15,58,15,238,8 + movdqa %xmm4,%xmm6 + psrlq $7,%xmm4 + paddq %xmm5,%xmm2 + movdqa %xmm6,%xmm5 + psrlq $1,%xmm6 + psllq $56,%xmm5 + pxor %xmm6,%xmm4 + psrlq $7,%xmm6 + pxor %xmm5,%xmm4 + psllq $7,%xmm5 + pxor %xmm6,%xmm4 + movdqa %xmm1,%xmm6 + pxor %xmm5,%xmm4 + movdqa %xmm1,%xmm5 + psrlq $6,%xmm6 + paddq %xmm4,%xmm2 + movdqa %xmm1,%xmm4 + psrlq $19,%xmm5 + psllq $3,%xmm4 + pxor %xmm5,%xmm6 + psrlq $42,%xmm5 + pxor %xmm4,%xmm6 + psllq $42,%xmm4 + pxor %xmm5,%xmm6 + movdqa (%edx),%xmm5 + pxor %xmm4,%xmm6 + movdqa 32(%ebp),%xmm4 + movq %mm4,%mm1 + paddq %xmm6,%xmm2 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm2,%xmm4 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm4,-96(%edx) + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm4 +.byte 102,15,58,15,235,8 + movdqa %xmm7,48(%edx) +.byte 102,15,58,15,247,8 + movdqa %xmm5,%xmm7 + psrlq $7,%xmm5 + paddq %xmm6,%xmm3 + movdqa %xmm7,%xmm6 + psrlq $1,%xmm7 + psllq $56,%xmm6 + pxor %xmm7,%xmm5 + psrlq $7,%xmm7 + pxor %xmm6,%xmm5 + psllq $7,%xmm6 + pxor %xmm7,%xmm5 + movdqa %xmm2,%xmm7 + pxor %xmm6,%xmm5 + movdqa %xmm2,%xmm6 + psrlq $6,%xmm7 + paddq %xmm5,%xmm3 + movdqa %xmm2,%xmm5 + psrlq $19,%xmm6 + psllq $3,%xmm5 + pxor %xmm6,%xmm7 + psrlq $42,%xmm6 + pxor %xmm5,%xmm7 + psllq $42,%xmm5 + pxor %xmm6,%xmm7 + movdqa 16(%edx),%xmm6 + pxor %xmm5,%xmm7 + movdqa 48(%ebp),%xmm5 + movq %mm4,%mm1 + paddq %xmm7,%xmm3 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm3,%xmm5 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq 
%mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm5,-80(%edx) + movdqa %xmm1,%xmm7 + movdqa %xmm6,%xmm5 +.byte 102,15,58,15,244,8 + movdqa %xmm0,(%edx) +.byte 102,15,58,15,248,8 + movdqa %xmm6,%xmm0 + psrlq $7,%xmm6 + paddq %xmm7,%xmm4 + movdqa %xmm0,%xmm7 + psrlq $1,%xmm0 + psllq $56,%xmm7 + pxor %xmm0,%xmm6 + psrlq $7,%xmm0 + pxor %xmm7,%xmm6 + psllq $7,%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm3,%xmm0 + pxor %xmm7,%xmm6 + movdqa %xmm3,%xmm7 + psrlq $6,%xmm0 + paddq %xmm6,%xmm4 + movdqa %xmm3,%xmm6 + psrlq $19,%xmm7 + psllq $3,%xmm6 + pxor %xmm7,%xmm0 + psrlq $42,%xmm7 + pxor %xmm6,%xmm0 + psllq $42,%xmm6 + pxor %xmm7,%xmm0 + movdqa 32(%edx),%xmm7 + pxor %xmm6,%xmm0 + movdqa 64(%ebp),%xmm6 + movq %mm4,%mm1 + paddq %xmm0,%xmm4 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm4,%xmm6 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm6,-64(%edx) + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm6 +.byte 
102,15,58,15,253,8 + movdqa %xmm1,16(%edx) +.byte 102,15,58,15,193,8 + movdqa %xmm7,%xmm1 + psrlq $7,%xmm7 + paddq %xmm0,%xmm5 + movdqa %xmm1,%xmm0 + psrlq $1,%xmm1 + psllq $56,%xmm0 + pxor %xmm1,%xmm7 + psrlq $7,%xmm1 + pxor %xmm0,%xmm7 + psllq $7,%xmm0 + pxor %xmm1,%xmm7 + movdqa %xmm4,%xmm1 + pxor %xmm0,%xmm7 + movdqa %xmm4,%xmm0 + psrlq $6,%xmm1 + paddq %xmm7,%xmm5 + movdqa %xmm4,%xmm7 + psrlq $19,%xmm0 + psllq $3,%xmm7 + pxor %xmm0,%xmm1 + psrlq $42,%xmm0 + pxor %xmm7,%xmm1 + psllq $42,%xmm7 + pxor %xmm0,%xmm1 + movdqa 48(%edx),%xmm0 + pxor %xmm7,%xmm1 + movdqa 80(%ebp),%xmm7 + movq %mm4,%mm1 + paddq %xmm1,%xmm5 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm5,%xmm7 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm7,-48(%edx) + movdqa %xmm3,%xmm1 + movdqa %xmm0,%xmm7 +.byte 102,15,58,15,198,8 + movdqa %xmm2,32(%edx) +.byte 102,15,58,15,202,8 + movdqa %xmm0,%xmm2 + psrlq $7,%xmm0 + paddq %xmm1,%xmm6 + movdqa %xmm2,%xmm1 + psrlq $1,%xmm2 + psllq $56,%xmm1 + pxor %xmm2,%xmm0 + psrlq $7,%xmm2 + pxor %xmm1,%xmm0 + psllq $7,%xmm1 + pxor %xmm2,%xmm0 + movdqa %xmm5,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm1 + psrlq $6,%xmm2 + paddq %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + psrlq $19,%xmm1 + psllq $3,%xmm0 + pxor %xmm1,%xmm2 + psrlq $42,%xmm1 + pxor %xmm0,%xmm2 + psllq $42,%xmm0 + pxor %xmm1,%xmm2 + movdqa (%edx),%xmm1 + pxor %xmm0,%xmm2 + movdqa 96(%ebp),%xmm0 + movq %mm4,%mm1 + paddq %xmm2,%xmm6 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm6,%xmm0 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 
+ pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm0,-32(%edx) + movdqa %xmm4,%xmm2 + movdqa %xmm1,%xmm0 +.byte 102,15,58,15,207,8 + movdqa %xmm3,48(%edx) +.byte 102,15,58,15,211,8 + movdqa %xmm1,%xmm3 + psrlq $7,%xmm1 + paddq %xmm2,%xmm7 + movdqa %xmm3,%xmm2 + psrlq $1,%xmm3 + psllq $56,%xmm2 + pxor %xmm3,%xmm1 + psrlq $7,%xmm3 + pxor %xmm2,%xmm1 + psllq $7,%xmm2 + pxor %xmm3,%xmm1 + movdqa %xmm6,%xmm3 + pxor %xmm2,%xmm1 + movdqa %xmm6,%xmm2 + psrlq $6,%xmm3 + paddq %xmm1,%xmm7 + movdqa %xmm6,%xmm1 + psrlq $19,%xmm2 + psllq $3,%xmm1 + pxor %xmm2,%xmm3 + psrlq $42,%xmm2 + pxor %xmm1,%xmm3 + psllq $42,%xmm1 + pxor %xmm2,%xmm3 + movdqa 16(%edx),%xmm2 + pxor %xmm1,%xmm3 + movdqa 112(%ebp),%xmm1 + movq %mm4,%mm1 + paddq %xmm3,%xmm7 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm7,%xmm1 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm1,-16(%edx) + leal 128(%ebp),%ebp + decl %ecx + jnz L00600_47_ssse3 + movdqa (%ebp),%xmm1 + leal -640(%ebp),%ebp + movdqu (%ebx),%xmm0 +.byte 102,15,56,0,193 + movdqa (%ebp),%xmm3 + movdqa 
%xmm1,%xmm2 + movdqu 16(%ebx),%xmm1 + paddq %xmm0,%xmm3 +.byte 102,15,56,0,202 + movq %mm4,%mm1 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%ebx),%xmm2 + paddq %xmm1,%xmm4 +.byte 102,15,56,0,211 + movq %mm4,%mm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 
48(%ebx),%xmm3 + paddq %xmm2,%xmm5 +.byte 102,15,56,0,220 + movq %mm4,%mm1 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%ebx),%xmm4 + paddq %xmm3,%xmm6 +.byte 102,15,56,0,229 + movq %mm4,%mm1 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%ebx),%xmm5 + paddq 
%xmm4,%xmm7 +.byte 102,15,56,0,238 + movq %mm4,%mm1 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%ebx),%xmm6 + paddq %xmm5,%xmm0 +.byte 102,15,56,0,247 + movq %mm4,%mm1 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 
112(%ebx),%xmm7 + paddq %xmm6,%xmm1 +.byte 102,15,56,0,248 + movq %mm4,%mm1 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movq %mm4,%mm1 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm2,-16(%edx) + movq 8(%esp),%mm1 + paddq %mm3,%mm0 + movq 24(%esp),%mm3 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq 
(%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + cmpl %eax,%edi + jb L005loop_ssse3 + movl 76(%edx),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK512: +.long 3609767458,1116352408 +.long 602891725,1899447441 +.long 3964484399,3049323471 +.long 2173295548,3921009573 +.long 4081628472,961987163 +.long 3053834265,1508970993 +.long 2937671579,2453635748 +.long 3664609560,2870763221 +.long 2734883394,3624381080 +.long 1164996542,310598401 +.long 1323610764,607225278 +.long 3590304994,1426881987 +.long 4068182383,1925078388 +.long 991336113,2162078206 +.long 633803317,2614888103 +.long 3479774868,3248222580 +.long 2666613458,3835390401 +.long 944711139,4022224774 +.long 2341262773,264347078 +.long 2007800933,604807628 +.long 1495990901,770255983 +.long 1856431235,1249150122 +.long 3175218132,1555081692 +.long 2198950837,1996064986 +.long 3999719339,2554220882 +.long 766784016,2821834349 +.long 2566594879,2952996808 +.long 3203337956,3210313671 +.long 1034457026,3336571891 +.long 2466948901,3584528711 +.long 3758326383,113926993 +.long 168717936,338241895 +.long 1188179964,666307205 +.long 1546045734,773529912 +.long 1522805485,1294757372 +.long 2643833823,1396182291 +.long 2343527390,1695183700 +.long 1014477480,1986661051 +.long 1206759142,2177026350 +.long 344077627,2456956037 +.long 1290863460,2730485921 +.long 3158454273,2820302411 +.long 3505952657,3259730800 +.long 106217008,3345764771 +.long 3606008344,3516065817 +.long 1432725776,3600352804 +.long 1467031594,4094571909 +.long 851169720,275423344 +.long 3100823752,430227734 +.long 1363258195,506948616 +.long 3750685593,659060556 +.long 3785050280,883997877 +.long 3318307427,958139571 +.long 3812723403,1322822218 +.long 2003034995,1537002063 +.long 3602036899,1747873779 +.long 1575990012,1955562222 +.long 1125592928,2024104815 +.long 2716904306,2227730452 +.long 442776044,2361852424 +.long 593698344,2428436474 +.long 3733110249,2756734187 +.long 2999351573,3204031479 +.long 3815920427,3329325298 +.long 3928383900,3391569614 +.long 566280711,3515267271 +.long 3454069534,3940187606 +.long 4000239992,4118630271 +.long 1914138554,116418474 +.long 2731055270,174292421 +.long 3203993006,289380356 +.long 320620315,460393269 +.long 587496836,685471733 +.long 1086792851,852142971 +.long 365543100,1017036298 +.long 2618297676,1126000580 +.long 3409855158,1288033470 +.long 4234509866,1501505948 +.long 987167468,1607167915 +.long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-linux.S similarity index 83% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-586-linux.S index 
aa1ada92..cdd4b413 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -7,12 +6,12 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) .text -.globl sha512_block_data_order -.hidden sha512_block_data_order -.type sha512_block_data_order,@function +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,@function .align 16 -sha512_block_data_order: -.L_sha512_block_data_order_begin: +sha512_block_data_order_nohw: +.L_sha512_block_data_order_nohw_begin: pushl %ebp pushl %ebx pushl %esi @@ -24,7 +23,7 @@ sha512_block_data_order: call .L000pic_point .L000pic_point: popl %ebp - leal .L001K512-.L000pic_point(%ebp),%ebp + leal .LK512-.L000pic_point(%ebp),%ebp subl $16,%esp andl $-64,%esp shll $7,%eax @@ -33,28 +32,18 @@ sha512_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) - leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx - movl (%edx),%ecx - testl $67108864,%ecx - jz .L002loop_x86 - movl 4(%edx),%edx movq (%esi),%mm0 - andl $16777216,%ecx movq 8(%esi),%mm1 - andl $512,%edx movq 16(%esi),%mm2 - orl %edx,%ecx movq 24(%esi),%mm3 movq 32(%esi),%mm4 movq 40(%esi),%mm5 movq 48(%esi),%mm6 movq 56(%esi),%mm7 - cmpl $16777728,%ecx - je .L003SSSE3 subl $80,%esp - jmp .L004loop_sse2 + jmp .L001loop_sse2 .align 16 -.L004loop_sse2: +.L001loop_sse2: movq %mm1,8(%esp) movq %mm2,16(%esp) movq %mm3,24(%esp) @@ -69,9 +58,9 @@ sha512_block_data_order: movl $15,%edx bswap %eax bswap %ebx - jmp .L00500_14_sse2 + jmp .L00200_14_sse2 .align 16 -.L00500_14_sse2: +.L00200_14_sse2: movd %eax,%mm1 movl (%edi),%eax movd %ebx,%mm7 @@ -132,7 +121,7 @@ sha512_block_data_order: paddq %mm6,%mm3 movq 48(%esp),%mm6 decl %edx - jnz .L00500_14_sse2 + jnz .L00200_14_sse2 movd %eax,%mm1 movd %ebx,%mm7 punpckldq %mm1,%mm7 @@ -188,9 +177,9 @@ sha512_block_data_order: paddq %mm6,%mm3 pxor %mm0,%mm0 movl $32,%edx - jmp .L00616_79_sse2 + jmp .L00316_79_sse2 .align 16 -.L00616_79_sse2: +.L00316_79_sse2: movq 88(%esp),%mm5 movq %mm7,%mm1 psrlq $1,%mm7 @@ -344,7 +333,7 @@ sha512_block_data_order: paddq %mm6,%mm0 addl $8,%ebp decl %edx - jnz .L00616_79_sse2 + jnz .L00316_79_sse2 paddq %mm3,%mm0 movq 8(%esp),%mm1 movq 24(%esp),%mm3 @@ -372,7 +361,7 @@ sha512_block_data_order: leal (%esp,%eax,1),%esp subl %eax,%ebp cmpl 88(%esp),%edi - jb .L004loop_sse2 + jb .L001loop_sse2 movl 92(%esp),%esp emms popl %edi @@ -380,8 +369,41 @@ sha512_block_data_order: popl %ebx popl %ebp ret -.align 32 -.L003SSSE3: +.size sha512_block_data_order_nohw,.-.L_sha512_block_data_order_nohw_begin +.globl sha512_block_data_order_ssse3 +.hidden sha512_block_data_order_ssse3 +.type sha512_block_data_order_ssse3,@function +.align 16 +sha512_block_data_order_ssse3: +.L_sha512_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call .L004pic_point +.L004pic_point: + popl %ebp + leal .LK512-.L004pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + 
movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 leal -64(%esp),%edx subl $256,%esp movdqa 640(%ebp),%xmm1 @@ -438,7 +460,7 @@ sha512_block_data_order: movdqa %xmm2,-16(%edx) nop .align 32 -.L007loop_ssse3: +.L005loop_ssse3: movdqa 16(%edx),%xmm2 movdqa %xmm3,48(%edx) leal 128(%ebp),%ebp @@ -455,9 +477,9 @@ sha512_block_data_order: pxor %mm1,%mm2 movq %mm7,56(%esp) pxor %mm3,%mm3 - jmp .L00800_47_ssse3 + jmp .L00600_47_ssse3 .align 32 -.L00800_47_ssse3: +.L00600_47_ssse3: movdqa %xmm5,%xmm3 movdqa %xmm2,%xmm1 .byte 102,15,58,15,208,8 @@ -1476,7 +1498,7 @@ sha512_block_data_order: movdqa %xmm1,-16(%edx) leal 128(%ebp),%ebp decl %ecx - jnz .L00800_47_ssse3 + jnz .L00600_47_ssse3 movdqa (%ebp),%xmm1 leal -640(%ebp),%ebp movdqu (%ebx),%xmm0 @@ -2288,7 +2310,7 @@ sha512_block_data_order: movq %mm6,48(%esi) movq %mm7,56(%esi) cmpl %eax,%edi - jb .L007loop_ssse3 + jb .L005loop_ssse3 movl 76(%edx),%esp emms popl %edi @@ -2296,456 +2318,9 @@ sha512_block_data_order: popl %ebx popl %ebp ret -.align 16 -.L002loop_x86: - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 16(%edi),%eax - movl 20(%edi),%ebx - movl 24(%edi),%ecx - movl 28(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 32(%edi),%eax - movl 36(%edi),%ebx - movl 40(%edi),%ecx - movl 44(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 48(%edi),%eax - movl 52(%edi),%ebx - movl 56(%edi),%ecx - movl 60(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 64(%edi),%eax - movl 68(%edi),%ebx - movl 72(%edi),%ecx - movl 76(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 80(%edi),%eax - movl 84(%edi),%ebx - movl 88(%edi),%ecx - movl 92(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 96(%edi),%eax - movl 100(%edi),%ebx - movl 104(%edi),%ecx - movl 108(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 112(%edi),%eax - movl 116(%edi),%ebx - movl 120(%edi),%ecx - movl 124(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - addl $128,%edi - subl $72,%esp - movl %edi,204(%esp) - leal 8(%esp),%edi - movl $16,%ecx -.long 2784229001 -.align 16 -.L00900_15_x86: - movl 40(%esp),%ecx - movl 44(%esp),%edx - movl %ecx,%esi - shrl $9,%ecx - movl %edx,%edi - shrl $9,%edx - movl %ecx,%ebx - shll $14,%esi - movl %edx,%eax - shll $14,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%eax - shll $4,%esi - xorl %edx,%ebx - shll $4,%edi - xorl %esi,%ebx - shrl $4,%ecx - xorl %edi,%eax - shrl $4,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 48(%esp),%ecx - movl 52(%esp),%edx - movl 56(%esp),%esi - movl 60(%esp),%edi - addl 64(%esp),%eax - adcl 68(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - andl 40(%esp),%ecx - andl 44(%esp),%edx - addl 192(%esp),%eax - adcl 196(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - movl (%ebp),%esi - movl 4(%ebp),%edi - addl %ecx,%eax - adcl %edx,%ebx - movl 32(%esp),%ecx - movl 36(%esp),%edx - addl %esi,%eax - 
adcl %edi,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - addl %ecx,%eax - adcl %edx,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,%esi - shrl $2,%ecx - movl %edx,%edi - shrl $2,%edx - movl %ecx,%ebx - shll $4,%esi - movl %edx,%eax - shll $4,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%ebx - shll $21,%esi - xorl %edx,%eax - shll $21,%edi - xorl %esi,%eax - shrl $21,%ecx - xorl %edi,%ebx - shrl $21,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl 16(%esp),%esi - movl 20(%esp),%edi - addl (%esp),%eax - adcl 4(%esp),%ebx - orl %esi,%ecx - orl %edi,%edx - andl 24(%esp),%ecx - andl 28(%esp),%edx - andl 8(%esp),%esi - andl 12(%esp),%edi - orl %esi,%ecx - orl %edi,%edx - addl %ecx,%eax - adcl %edx,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - movb (%ebp),%dl - subl $8,%esp - leal 8(%ebp),%ebp - cmpb $148,%dl - jne .L00900_15_x86 -.align 16 -.L01016_79_x86: - movl 312(%esp),%ecx - movl 316(%esp),%edx - movl %ecx,%esi - shrl $1,%ecx - movl %edx,%edi - shrl $1,%edx - movl %ecx,%eax - shll $24,%esi - movl %edx,%ebx - shll $24,%edi - xorl %esi,%ebx - shrl $6,%ecx - xorl %edi,%eax - shrl $6,%edx - xorl %ecx,%eax - shll $7,%esi - xorl %edx,%ebx - shll $1,%edi - xorl %esi,%ebx - shrl $1,%ecx - xorl %edi,%eax - shrl $1,%edx - xorl %ecx,%eax - shll $6,%edi - xorl %edx,%ebx - xorl %edi,%eax - movl %eax,(%esp) - movl %ebx,4(%esp) - movl 208(%esp),%ecx - movl 212(%esp),%edx - movl %ecx,%esi - shrl $6,%ecx - movl %edx,%edi - shrl $6,%edx - movl %ecx,%eax - shll $3,%esi - movl %edx,%ebx - shll $3,%edi - xorl %esi,%eax - shrl $13,%ecx - xorl %edi,%ebx - shrl $13,%edx - xorl %ecx,%eax - shll $10,%esi - xorl %edx,%ebx - shll $10,%edi - xorl %esi,%ebx - shrl $10,%ecx - xorl %edi,%eax - shrl $10,%edx - xorl %ecx,%ebx - shll $13,%edi - xorl %edx,%eax - xorl %edi,%eax - movl 320(%esp),%ecx - movl 324(%esp),%edx - addl (%esp),%eax - adcl 4(%esp),%ebx - movl 248(%esp),%esi - movl 252(%esp),%edi - addl %ecx,%eax - adcl %edx,%ebx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,192(%esp) - movl %ebx,196(%esp) - movl 40(%esp),%ecx - movl 44(%esp),%edx - movl %ecx,%esi - shrl $9,%ecx - movl %edx,%edi - shrl $9,%edx - movl %ecx,%ebx - shll $14,%esi - movl %edx,%eax - shll $14,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%eax - shll $4,%esi - xorl %edx,%ebx - shll $4,%edi - xorl %esi,%ebx - shrl $4,%ecx - xorl %edi,%eax - shrl $4,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 48(%esp),%ecx - movl 52(%esp),%edx - movl 56(%esp),%esi - movl 60(%esp),%edi - addl 64(%esp),%eax - adcl 68(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - andl 40(%esp),%ecx - andl 44(%esp),%edx - addl 192(%esp),%eax - adcl 196(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - movl (%ebp),%esi - movl 4(%ebp),%edi - addl %ecx,%eax - adcl %edx,%ebx - movl 32(%esp),%ecx - movl 36(%esp),%edx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - addl %ecx,%eax - adcl %edx,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,%esi - shrl $2,%ecx - movl %edx,%edi - shrl $2,%edx - movl %ecx,%ebx - shll $4,%esi - movl %edx,%eax - shll $4,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%ebx - shll $21,%esi - xorl %edx,%eax - shll $21,%edi - xorl %esi,%eax - shrl $21,%ecx - xorl %edi,%ebx 
- shrl $21,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl 16(%esp),%esi - movl 20(%esp),%edi - addl (%esp),%eax - adcl 4(%esp),%ebx - orl %esi,%ecx - orl %edi,%edx - andl 24(%esp),%ecx - andl 28(%esp),%edx - andl 8(%esp),%esi - andl 12(%esp),%edi - orl %esi,%ecx - orl %edi,%edx - addl %ecx,%eax - adcl %edx,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - movb (%ebp),%dl - subl $8,%esp - leal 8(%ebp),%ebp - cmpb $23,%dl - jne .L01016_79_x86 - movl 840(%esp),%esi - movl 844(%esp),%edi - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - addl 8(%esp),%eax - adcl 12(%esp),%ebx - movl %eax,(%esi) - movl %ebx,4(%esi) - addl 16(%esp),%ecx - adcl 20(%esp),%edx - movl %ecx,8(%esi) - movl %edx,12(%esi) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - addl 24(%esp),%eax - adcl 28(%esp),%ebx - movl %eax,16(%esi) - movl %ebx,20(%esi) - addl 32(%esp),%ecx - adcl 36(%esp),%edx - movl %ecx,24(%esi) - movl %edx,28(%esi) - movl 32(%esi),%eax - movl 36(%esi),%ebx - movl 40(%esi),%ecx - movl 44(%esi),%edx - addl 40(%esp),%eax - adcl 44(%esp),%ebx - movl %eax,32(%esi) - movl %ebx,36(%esi) - addl 48(%esp),%ecx - adcl 52(%esp),%edx - movl %ecx,40(%esi) - movl %edx,44(%esi) - movl 48(%esi),%eax - movl 52(%esi),%ebx - movl 56(%esi),%ecx - movl 60(%esi),%edx - addl 56(%esp),%eax - adcl 60(%esp),%ebx - movl %eax,48(%esi) - movl %ebx,52(%esi) - addl 64(%esp),%ecx - adcl 68(%esp),%edx - movl %ecx,56(%esi) - movl %edx,60(%esi) - addl $840,%esp - subl $640,%ebp - cmpl 8(%esp),%edi - jb .L002loop_x86 - movl 12(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret +.size sha512_block_data_order_ssse3,.-.L_sha512_block_data_order_ssse3_begin .align 64 -.L001K512: +.LK512: .long 3609767458,1116352408 .long 602891725,1899447441 .long 3964484399,3049323471 @@ -2828,14 +2403,12 @@ sha512_block_data_order: .long 1246189591,1816402316 .long 67438087,66051 .long 202182159,134810123 -.size sha512_block_data_order,.-.L_sha512_block_data_order_begin .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv4-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-armv4-linux.S index aa19360c..a65dd577 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv4-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1855,7 +1854,6 @@ sha512_block_data_order_neon:
 .align 2
 .align 2
 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
-#endif // defined(__arm__) && defined(__linux__)
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-apple.S
similarity index 99%
rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-ios.ios.aarch64.S
rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-apple.S
index a5b629fc..c4322431 100644
--- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-ios.ios.aarch64.S
+++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-apple.S
@@ -1,5 +1,4 @@
 #define BORINGSSL_PREFIX CCryptoBoringSSL
-#if defined(__aarch64__) && defined(__APPLE__)
 // This file is generated from a similarly-named Perl script in the BoringSSL
 // source tree. Do not edit by hand.

@@ -1596,7 +1595,6 @@ Loop_hw:

 #endif
 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#endif // defined(__aarch64__) && defined(__APPLE__)
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-linux.S
similarity index 99%
rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-linux.linux.aarch64.S
rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-linux.S
index 0a57a1b1..b5a9da51 100644
--- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-linux.linux.aarch64.S
+++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-linux.S
@@ -1,5 +1,4 @@
 #define BORINGSSL_PREFIX CCryptoBoringSSL
-#if defined(__aarch64__) && defined(__linux__)
 // This file is generated from a similarly-named Perl script in the BoringSSL
 // source tree. Do not edit by hand.

@@ -1596,7 +1595,6 @@ sha512_block_data_order_hw:
 .size sha512_block_data_order_hw,.-sha512_block_data_order_hw
 #endif
 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
-#endif // defined(__aarch64__) && defined(__linux__)
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-win.S
new file mode 100644
index 00000000..5de6ea53
--- /dev/null
+++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-win.S
@@ -0,0 +1,1605 @@
+#define BORINGSSL_PREFIX CCryptoBoringSSL
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +# include +#endif + +.text + +.globl sha512_block_data_order_nohw + +.def sha512_block_data_order_nohw + .type 32 +.endef +.align 6 +sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512 + add x30,x30,:lo12:LK512 + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 
+ add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and 
x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // 
(b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and 
x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + 
orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // 
h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor 
x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b 
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw + +.def sha512_block_data_order_hw + .type 32 +.endef +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,LK512 + add x3,x3,:lo12:LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b Loop_hw + +.align 4 +Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 
{v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add 
v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 
0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // 
"T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add 
v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-apple.S index a5b8b3dc..7bfe0bb6 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
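
Editor's note on the scalar path added above for the new Windows/AArch64 target: the sha512_block_data_order_nohw body annotates every round with Sigma1(e), Ch(e,f,g), Maj(a,b,c), Sigma0(a) and, in the schedule rounds, sigma0/sigma1. The sketch below is a minimal C rendering of the FIPS 180-4 round those comments name, for readers following the register-level comments; it is an editorial illustration, not part of this patch, and the helper names are invented. The rotation amounts correspond to the ror#14/#18/#41 and ror#28/#34/#39 sequences in the assembly.

```c
#include <stdint.h>

/* Editorial sketch of one SHA-512 round (FIPS 180-4); not part of this patch. */
static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

static inline uint64_t Sigma0(uint64_t x) { return rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39); }
static inline uint64_t Sigma1(uint64_t x) { return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41); }
static inline uint64_t sigma0(uint64_t x) { return rotr64(x, 1)  ^ rotr64(x, 8)  ^ (x >> 7); }  /* schedule: W[i-15] */
static inline uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }  /* schedule: W[i-2]  */
static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return (e & f) ^ (~e & g); }
static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return (a & b) ^ (a & c) ^ (b & c); }

/* One round over state s[0..7] = a..h, round constant k, message word w:
 *   T1 = h + Sigma1(e) + Ch(e,f,g) + k + w   ("h+=K[i]", "h+=X[i]", "h+=Ch", "h+=Sigma1(e)")
 *   T2 = Sigma0(a) + Maj(a,b,c)
 *   d += T1 ("d+=h"); new a = T1 + T2 ("h+=Maj", "h+=Sigma0(a)")
 */
static void sha512_round(uint64_t s[8], uint64_t k, uint64_t w) {
    uint64_t t1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + k + w;
    uint64_t t2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
    s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
    s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
}
```

In the generated file the same computation is interleaved across x19–x28 and unrolled sixteen rounds at a time, while the sha512_block_data_order_hw variant replaces it with the Armv8.2 SHA-512 crypto-extension instructions emitted above as raw .long opcodes (sha512h, sha512h2, sha512su0, sha512su1).
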
@@ -2978,7 +2977,6 @@ L$epilogue_avx: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-linux.S index c6f54669..2f8e9dce 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -2978,7 +2977,6 @@ _CET_ENDBR .cfi_endproc .size sha512_block_data_order_avx,.-sha512_block_data_order_avx #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv7-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv7-linux.S index 42ae7354..d87e8703 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv7-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1225,7 +1224,6 @@ vpaes_ctr32_encrypt_blocks: ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-apple.S index 8172671d..bf67728f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1224,7 +1223,6 @@ Lctr32_done: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-linux.S index 6a7e2c27..c690fffc 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1224,7 +1223,6 @@ vpaes_ctr32_encrypt_blocks: ret .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-win.S new file mode 100644 index 00000000..d0ae73aa --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-win.S @@ -0,0 +1,1267 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.section .rodata + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 
0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.def _vpaes_encrypt_preheat + .type 32 +.endef +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.def _vpaes_encrypt_core + .type 32 +.endef +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + +.globl vpaes_encrypt + +.def vpaes_encrypt + .type 32 +.endef +.align 4 +vpaes_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.def _vpaes_encrypt_2x + .type 32 +.endef +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + + +.def _vpaes_decrypt_preheat + .type 32 +.endef +.align 4 +_vpaes_decrypt_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v17.16b, #0x0f + adrp x11, Lk_dipt + add x11, x11, :lo12:Lk_dipt + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe + ret + + +## +## Decryption core +## +## Same API as encryption core. 
+## +.def _vpaes_decrypt_core + .type 32 +.endef +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr + add x10, x10, :lo12:Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward+48 + add x10, x10, :lo12:Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Ldec_entry + +.align 4 +Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, 
%xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret + + +.globl vpaes_decrypt + +.def vpaes_decrypt + .type 32 +.endef +.align 4 +vpaes_decrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// v14-v15 input, v0-v1 output +.def _vpaes_decrypt_2x + .type 32 +.endef +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr + add x10, x10, :lo12:Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward+48 + add x10, x10, :lo12:Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Ldec_2x_entry + +.align 4 +Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + 
tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.def _vpaes_key_preheat + .type 32 +.endef +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v16.16b, #0x5b // 
Lk_s63 + adrp x11, Lk_sb1 + add x11, x11, :lo12:Lk_sb1 + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + adrp x10, Lk_dksd + add x10, x10, :lo12:Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward + add x11, x11, :lo12:Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + +.def _vpaes_schedule_core + .type 32 +.endef +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr // lea Lk_sr(%rip),%r10 + add x10, x10, :lo12:Lk_sr + + add x8, x8, x10 + cbnz w3, Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b Lschedule_go + +Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. 
The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:Lk_deskew + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. 
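+## (In the register sketches below, "+" denotes XOR, as in the other
+## key-schedule comments.)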
+## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.def _vpaes_schedule_192_smear + .type 32 +.endef +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.def _vpaes_schedule_round + .type 32 +.endef +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. 
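+	// (_vpaes_schedule_round falls through to this label; the 256-bit key
+	// schedule in Loop_schedule_256 also branches here directly via
+	// bl _vpaes_schedule_low_round.)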
+_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.def _vpaes_schedule_transform + .type 32 +.endef +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.def _vpaes_schedule_mangle + .type 32 +.endef +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b Lschedule_mangle_both +.align 4 +Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl vpaes_set_encrypt_key + +.def vpaes_set_encrypt_key + .type 32 +.endef +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl vpaes_set_decrypt_key + +.def vpaes_set_decrypt_key + .type 32 +.endef +.align 4 +vpaes_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl vpaes_cbc_encrypt + +.def vpaes_cbc_encrypt + .type 32 +.endef +.align 4 +vpaes_cbc_encrypt: + AARCH64_SIGN_LINK_REGISTER + cbz x2, Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b Lcbc_enc_loop + +.align 4 +Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +Lcbc_abort: + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.def vpaes_cbc_decrypt + .type 32 +.endef +.align 4 +vpaes_cbc_decrypt: + // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to + // only from vpaes_cbc_encrypt which has already signed the return address. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls Lcbc_dec_done + +.align 4 +Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi Lcbc_dec_loop2x + +Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl vpaes_ctr32_encrypt_blocks + +.def vpaes_ctr32_encrypt_blocks + .type 32 +.endef +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. 
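+	// (The final word of the 16-byte IV is the 32-bit block counter, stored
+	// big-endian; it is byte-swapped into w6 for arithmetic and swapped back
+	// whenever it is written into lane 3 of v7/v14/v15.)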
+ ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-apple.S new file mode 100644 index 00000000..28f72d3b --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-apple.S @@ -0,0 +1,685 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.align 6,0x90 +L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 2213177600,1597421020,4103937655,675398315 +.long 2749646592,4273543773,1511898873,121693092 +.long 3040248576,1103263732,2871565598,1608280554 +.long 2236667136,2588920351,482954393,64377734 +.long 3069987328,291237287,2117370568,3650299247 +.long 533321216,3573750986,2572112006,1401264716 +.long 1339849704,2721158661,548607111,3445553514 +.long 2128193280,3054596040,2183486460,1257083700 +.long 655635200,1165381986,3923443150,2344132524 +.long 190078720,256924420,290342170,357187870 +.long 1610966272,2263057382,4103205268,309794674 +.long 2592527872,2233205587,1335446729,3402964816 +.long 3973531904,3225098121,3002836325,1918774430 +.long 3870401024,2102906079,2284471353,4117666579 +.long 617007872,1021508343,366931923,691083277 +.long 2528395776,3491914898,2968704004,1613121270 +.long 3445188352,3247741094,844474987,4093578302 +.long 651481088,1190302358,1689581232,574775300 +.long 4289380608,206939853,2555985458,2489840491 +.long 2130264064,327674451,3566485037,3349835193 +.long 2470714624,316102159,3636825756,3393945945 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 6,0x90 +.private_extern __vpaes_preheat +.align 4 +__vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.private_extern __vpaes_encrypt_core +.align 4 +__vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 +.byte 102,15,56,0,208 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx +.byte 102,15,56,0,193 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp L000enc_entry +.align 4,0x90 +L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + 
pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addl $16,%edx + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addl $16,%ecx + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.private_extern __vpaes_decrypt_core +.align 4 +__vpaes_decrypt_core: + leal 608(%ebp),%ebx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa -64(%ebx),%xmm2 + pandn %xmm0,%xmm1 + movl %eax,%ecx + psrld $4,%xmm1 + movdqu (%edx),%xmm5 + shll $4,%ecx + pand %xmm6,%xmm0 +.byte 102,15,56,0,208 + movdqa -48(%ebx),%xmm0 + xorl $48,%ecx +.byte 102,15,56,0,193 + andl $48,%ecx + pxor %xmm5,%xmm2 + movdqa 176(%ebp),%xmm5 + pxor %xmm2,%xmm0 + addl $16,%edx + leal -352(%ebx,%ecx,1),%ecx + jmp L002dec_entry +.align 4,0x90 +L003dec_loop: + movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addl $16,%edx +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax +L002dec_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + psrld $4,%xmm1 +.byte 102,15,56,0,208 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm7,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 + jnz L003dec_loop + movdqa 96(%ebx),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%ebx),%xmm0 + movdqa (%ecx),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + ret +.private_extern __vpaes_schedule_core +.align 4 +__vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call __vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz L004schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp L005schedule_go +L004schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%edx) + xorl $48,%ecx +L005schedule_go: + cmpl $192,%eax + ja L006schedule_256 + je L007schedule_192 +L008schedule_128: + movl $10,%eax +L009loop_schedule_128: + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call 
__vpaes_schedule_mangle + jmp L009loop_schedule_128 +.align 4,0x90 +L007schedule_192: + movdqu 8(%esi),%xmm0 + call __vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%eax +L011loop_schedule_192: + call __vpaes_schedule_round +.byte 102,15,58,15,198,8 + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + call __vpaes_schedule_mangle + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + jmp L011loop_schedule_192 +.align 4,0x90 +L006schedule_256: + movdqu 16(%esi),%xmm0 + call __vpaes_schedule_transform + movl $7,%eax +L012loop_schedule_256: + call __vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp L012loop_schedule_256 +.align 4,0x90 +L010schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz L013schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,193 + leal 352(%ebp),%ebx + addl $32,%edx +L013schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call __vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.private_extern __vpaes_schedule_192_smear +.align 4 +__vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.private_extern __vpaes_schedule_round +.align 4 +__vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 +.byte 102,15,58,15,202,15 +.byte 102,15,58,15,210,15 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + movdqa %xmm2,8(%esp) +L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 +.byte 102,15,56,0,226 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.private_extern __vpaes_schedule_transform +.align 4 +__vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%ebx),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.private_extern __vpaes_schedule_mangle +.align 4 +__vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz L014schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + jmp L015schedule_mangle_both +.align 4,0x90 +L014schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal 416(%ebp),%esi + movdqa 
%xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 32(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 64(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 96(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + addl $-16,%edx +L015schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key +.align 4 +_vpaes_set_encrypt_key: +L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L016pic_for_function_hit +L016pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+5-L016pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal L_vpaes_consts+0x30-L017pic_point,%ebp + call __vpaes_schedule_core +L017pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key +.align 4 +_vpaes_set_decrypt_key: +L_vpaes_set_decrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + shll $4,%ebx + leal 16(%edx,%ebx,1),%edx + movl $1,%edi + movl %eax,%ecx + shrl $1,%ecx + andl $32,%ecx + xorl $32,%ecx + leal L_vpaes_consts+0x30-L018pic_point,%ebp + call __vpaes_schedule_core +L018pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt +.align 4 +_vpaes_encrypt: +L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L019pic_for_function_hit +L019pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+4-L019pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal L_vpaes_consts+0x30-L020pic_point,%ebp + call __vpaes_preheat +L020pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt +.align 4 +_vpaes_decrypt: +L_vpaes_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal L_vpaes_consts+0x30-L021pic_point,%ebp + call __vpaes_preheat +L021pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl 
%ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_decrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_cbc_encrypt +.private_extern _vpaes_cbc_encrypt +.align 4 +_vpaes_cbc_encrypt: +L_vpaes_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + subl $16,%eax + jc L022cbc_abort + leal -56(%esp),%ebx + movl 36(%esp),%ebp + andl $-16,%ebx + movl 40(%esp),%ecx + xchgl %esp,%ebx + movdqu (%ebp),%xmm1 + subl %esi,%edi + movl %ebx,48(%esp) + movl %edi,(%esp) + movl %edx,4(%esp) + movl %ebp,8(%esp) + movl %eax,%edi + leal L_vpaes_consts+0x30-L023pic_point,%ebp + call __vpaes_preheat +L023pic_point: + cmpl $0,%ecx + je L024cbc_dec_loop + jmp L025cbc_enc_loop +.align 4,0x90 +L025cbc_enc_loop: + movdqu (%esi),%xmm0 + pxor %xmm1,%xmm0 + call __vpaes_encrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + movdqa %xmm0,%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc L025cbc_enc_loop + jmp L026cbc_done +.align 4,0x90 +L024cbc_dec_loop: + movdqu (%esi),%xmm0 + movdqa %xmm1,16(%esp) + movdqa %xmm0,32(%esp) + call __vpaes_decrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + pxor 16(%esp),%xmm0 + movdqa 32(%esp),%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc L024cbc_dec_loop +L026cbc_done: + movl 8(%esp),%ebx + movl 48(%esp),%esp + movdqu %xmm1,(%ebx) +L022cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-linux.S index e1f6de82..516d092d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
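The label-renaming hunks that follow only touch BoringSSL's dispatch-test instrumentation: when BORINGSSL_DISPATCH_TEST is defined, each accelerated entry point records that it ran by storing 1 into a slot of the BORINGSSL_function_hit array (offset +4 for vpaes_encrypt and +5 for vpaes_set_encrypt_key, per the leal lines). A minimal C sketch of reading that flag, assuming the extern declaration that BoringSSL's crypto/internal.h provides under the same macro; the helper name is hypothetical:

#include <stdint.h>

// Provided by BoringSSL's crypto/internal.h when BORINGSSL_DISPATCH_TEST is
// defined; the assembly entry points store a 1 into their assigned slot.
extern uint8_t BORINGSSL_function_hit[];

// Hypothetical helper: slot 4 matches the BORINGSSL_function_hit+4 store in
// vpaes_encrypt above (slot 5 would be vpaes_set_encrypt_key).
int vpaes_encrypt_was_used(void) {
  return BORINGSSL_function_hit[4] != 0;
}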
@@ -489,10 +488,10 @@ vpaes_set_encrypt_key: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx - call .L016pic -.L016pic: + call .L016pic_for_function_hit +.L016pic_for_function_hit: popl %ebx - leal BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx + leal BORINGSSL_function_hit+5-.L016pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx @@ -574,10 +573,10 @@ vpaes_encrypt: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx - call .L019pic -.L019pic: + call .L019pic_for_function_hit +.L019pic_for_function_hit: popl %ebx - leal BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx + leal BORINGSSL_function_hit+4-.L019pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx @@ -706,7 +705,6 @@ vpaes_cbc_encrypt: ret .size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-apple.S index 1b126a6c..32a84f12 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1131,7 +1130,6 @@ L$ctr_add_two: .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-linux.S index bc55da78..2a491c84 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1133,7 +1132,6 @@ _vpaes_consts: .size _vpaes_consts,.-_vpaes_consts .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-apple.S new file mode 100644 index 00000000..81253d99 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-apple.S @@ -0,0 +1,226 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
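+// bn_mul_mont computes the Montgomery product rp[] = ap[]*bp[]*R^-1 mod np[],
+// where R = 2^(32*num) for the 32-bit limbs used here and n0 points at
+// -np[0]^-1 mod 2^32; see crypto/fipsmodule/bn/internal.h for the exact C
+// prototype. It returns 1 on success and 0 when num < 4, in which case the
+// caller must fall back to the generic C implementation.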
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_mont +.private_extern _bn_mul_mont +.align 4 +_bn_mul_mont: +L_bn_mul_mont_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + movl 40(%esp),%edi + cmpl $4,%edi + jl L000just_leave + leal 20(%esp),%esi + leal 24(%esp),%edx + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%ebp + negl %edi + movl %ebp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%ebp + xorl %ebp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L001page_walk + jmp L002page_walk_done +.align 4,0x90 +L001page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L001page_walk +L002page_walk_done: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%ebp + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %ebp,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %edx,24(%esp) + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 4,0x90 +L0031st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl L0031st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +L004outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +L005inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz L005inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle L004outer + emms + jmp L006common_tail +.align 4,0x90 +L006common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 4,0x90 +L007sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge 
L007sub + sbbl $0,%eax + movl $-1,%edx + xorl %eax,%edx + jmp L008copy +.align 4,0x90 +L008copy: + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp + movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) + decl %ebx + jge L008copy + movl 24(%esp),%esp + movl $1,%eax +L000just_leave: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-linux.S b/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-linux.S new file mode 100644 index 00000000..c0e0ef07 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-linux.S @@ -0,0 +1,228 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl bn_mul_mont +.hidden bn_mul_mont +.type bn_mul_mont,@function +.align 16 +bn_mul_mont: +.L_bn_mul_mont_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + movl 40(%esp),%edi + cmpl $4,%edi + jl .L000just_leave + leal 20(%esp),%esi + leal 24(%esp),%edx + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%ebp + negl %edi + movl %ebp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%ebp + xorl %ebp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L001page_walk + jmp .L002page_walk_done +.align 16 +.L001page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L001page_walk +.L002page_walk_done: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%ebp + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %ebp,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %edx,24(%esp) + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 16 +.L0031st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl .L0031st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +.L004outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 
+ movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +.L005inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz .L005inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle .L004outer + emms + jmp .L006common_tail +.align 16 +.L006common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 16 +.L007sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge .L007sub + sbbl $0,%eax + movl $-1,%edx + xorl %eax,%edx + jmp .L008copy +.align 16 +.L008copy: + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp + movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) + decl %ebx + jge .L008copy + movl 24(%esp),%esp + movl $1,%eax +.L000just_leave: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_mul_mont,.-.L_bn_mul_mont_begin +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-apple.S index eb39df37..a6e7a2c8 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1235,7 +1234,6 @@ L$mulx4x_epilogue: .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-linux.S index 22565fe1..9a500d70 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1237,7 +1236,6 @@ _CET_ENDBR .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-apple.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-apple.S index d3a698ca..4b003639 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -8,26 +7,18 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - - -.globl _bn_mul_mont_gather5 -.private_extern _bn_mul_mont_gather5 +.globl _bn_mul_mont_gather5_nohw +.private_extern _bn_mul_mont_gather5_nohw .p2align 6 -_bn_mul_mont_gather5: +_bn_mul_mont_gather5_nohw: _CET_ENDBR + + movl %r9d,%r9d movq %rsp,%rax - testl $7,%r9d - jnz L$mul_enter - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - jmp L$mul4x_enter - -.p2align 4 -L$mul_enter: movd 8(%rsp),%xmm5 pushq %rbx @@ -454,17 +445,16 @@ L$mul_epilogue: ret +.globl _bn_mul4x_mont_gather5 +.private_extern _bn_mul4x_mont_gather5 .p2align 5 -bn_mul4x_mont_gather5: +_bn_mul4x_mont_gather5: +_CET_ENDBR .byte 0x67 movq %rsp,%rax -L$mul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je L$mulx4x_enter pushq %rbx pushq %rbp @@ -480,6 +470,9 @@ L$mul4x_enter: L$mul4x_prologue: .byte 0x67 + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -1089,20 +1082,15 @@ L$inner4x: jmp L$sqr4x_sub_entry -.globl _bn_power5 -.private_extern _bn_power5 +.globl _bn_power5_nohw +.private_extern _bn_power5_nohw .p2align 5 -_bn_power5: +_bn_power5_nohw: _CET_ENDBR movq %rsp,%rax - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je L$powerx5_enter pushq %rbx pushq %rbp @@ -1117,6 +1105,9 @@ _CET_ENDBR L$power5_prologue: + + + shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 @@ -2068,13 +2059,15 @@ L$sqr4x_sub_entry: ret +.globl _bn_mulx4x_mont_gather5 +.private_extern _bn_mulx4x_mont_gather5 .p2align 5 -bn_mulx4x_mont_gather5: +_bn_mulx4x_mont_gather5: +_CET_ENDBR movq %rsp,%rax -L$mulx4x_enter: pushq %rbx pushq %rbp @@ -2089,6 +2082,9 @@ L$mulx4x_enter: L$mulx4x_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -2605,13 +2601,15 @@ L$mulx4x_inner: jmp L$sqrx4x_sub_entry +.globl _bn_powerx5 +.private_extern _bn_powerx5 .p2align 5 -bn_powerx5: +_bn_powerx5: +_CET_ENDBR movq %rsp,%rax -L$powerx5_enter: pushq %rbx pushq %rbp @@ -2626,6 +2624,9 @@ L$powerx5_enter: L$powerx5_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -3624,7 +3625,6 @@ L$inc: .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-linux.S index 894fe5fd..6cec7db0 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -8,27 +7,18 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl bn_mul_mont_gather5 -.hidden bn_mul_mont_gather5 -.type bn_mul_mont_gather5,@function +.globl bn_mul_mont_gather5_nohw +.hidden bn_mul_mont_gather5_nohw +.type bn_mul_mont_gather5_nohw,@function .align 64 -bn_mul_mont_gather5: +bn_mul_mont_gather5_nohw: .cfi_startproc _CET_ENDBR + + movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax - testl $7,%r9d - jnz .Lmul_enter - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - jmp .Lmul4x_enter - -.align 16 -.Lmul_enter: movd 8(%rsp),%xmm5 pushq %rbx .cfi_offset %rbx,-16 @@ -454,18 +444,17 @@ _CET_ENDBR .Lmul_epilogue: ret .cfi_endproc -.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 +.size bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw +.globl bn_mul4x_mont_gather5 +.hidden bn_mul4x_mont_gather5 .type bn_mul4x_mont_gather5,@function .align 32 bn_mul4x_mont_gather5: .cfi_startproc +_CET_ENDBR .byte 0x67 movq %rsp,%rax .cfi_def_cfa_register %rax -.Lmul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -481,6 +470,9 @@ bn_mul4x_mont_gather5: .Lmul4x_prologue: .byte 0x67 + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -1090,20 +1082,15 @@ mul4x_internal: jmp .Lsqr4x_sub_entry .cfi_endproc .size mul4x_internal,.-mul4x_internal -.globl bn_power5 -.hidden bn_power5 -.type bn_power5,@function +.globl bn_power5_nohw +.hidden bn_power5_nohw +.type bn_power5_nohw,@function .align 32 -bn_power5: +bn_power5_nohw: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -1118,6 +1105,9 @@ _CET_ENDBR .cfi_offset %r15,-56 .Lpower5_prologue: + + + shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 @@ -1226,7 +1216,7 @@ _CET_ENDBR .Lpower5_epilogue: ret .cfi_endproc -.size bn_power5,.-bn_power5 +.size bn_power5_nohw,.-bn_power5_nohw .globl bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -2069,13 +2059,15 @@ __bn_post4x_internal: ret .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal +.globl bn_mulx4x_mont_gather5 +.hidden bn_mulx4x_mont_gather5 .type bn_mulx4x_mont_gather5,@function .align 32 bn_mulx4x_mont_gather5: .cfi_startproc +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax -.Lmulx4x_enter: pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2090,6 +2082,9 @@ bn_mulx4x_mont_gather5: .cfi_offset %r15,-56 .Lmulx4x_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -2606,13 +2601,15 @@ mulx4x_internal: jmp .Lsqrx4x_sub_entry .cfi_endproc .size mulx4x_internal,.-mulx4x_internal +.globl bn_powerx5 +.hidden bn_powerx5 .type bn_powerx5,@function .align 32 bn_powerx5: .cfi_startproc +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax -.Lpowerx5_enter: pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2627,6 +2624,9 @@ bn_powerx5: .cfi_offset %r15,-56 .Lpowerx5_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -3625,7 +3625,6 @@ _CET_ENDBR .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text #endif -#endif // defined(__x86_64__) && 
defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-apple.S index 9b474516..54a78781 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -3081,7 +3080,6 @@ _CET_ENDBR #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-linux.S index 67d9f8ab..cd46241a 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -3091,7 +3090,6 @@ _CET_ENDBR .cfi_endproc .size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv4-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-armv4-linux.S index 9a546d9c..75cd8b27 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv4-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1451,7 +1450,6 @@ ChaCha20_ctr32_neon: .size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-apple.S index 675f6a33..d82a7d53 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1968,7 +1967,6 @@ Ldone_512_neon: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-linux.S index 983f3af0..183a3b79 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1968,7 +1967,6 @@ ChaCha20_512_neon: ret .size ChaCha20_512_neon,.-ChaCha20_512_neon #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-win.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-win.S new file mode 100644 index 00000000..3d766f37 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-win.S @@ -0,0 +1,1979 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.section .rodata + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl ChaCha20_ctr32_nohw + +.def ChaCha20_ctr32_nohw + .type 32 +.endef +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl ChaCha20_ctr32_neon + +.def ChaCha20_ctr32_neon + .type 32 +.endef +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli 
v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext 
v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor 
v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def ChaCha20_512_neon + .type 32 +.endef +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor 
v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 
+ ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + 
add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror 
w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext 
v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add 
v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // 
store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-apple.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-apple.S new file mode 100644 index 00000000..997b2a2c --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-apple.S @@ -0,0 +1,962 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
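For reference, the AArch64 NEON file that ends above and the 32-bit x86 file that begins below are generated implementations of the same ChaCha20 core. Its quarter round is recognizable from the rotation schedule 16/12/8/7: on the scalar AArch64 side as ror by #16/#20/#24/#25, on the vector side as rev32 plus ushr/sli pairs, and on x86 below as roll $16/$12/$8/$7. A minimal scalar C sketch of that quarter round (illustrative only, not the generated code):

#include <stdint.h>

/* One ChaCha20 quarter round on four 32-bit state words (RFC 8439).
 * The generated assembly runs several of these in parallel per lane. */
static inline uint32_t rotl32(uint32_t x, int n) {
  return (x << n) | (x >> (32 - n));
}

static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d) {
  *a += *b; *d ^= *a; *d = rotl32(*d, 16);
  *c += *d; *b ^= *c; *b = rotl32(*b, 12);
  *a += *b; *d ^= *a; *d = rotl32(*d, 8);
  *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}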
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw +.align 4 +_ChaCha20_ctr32_nohw: +L_ChaCha20_ctr32_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 32(%esp),%esi + movl 36(%esp),%edi + subl $132,%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,80(%esp) + movl %ebx,84(%esp) + movl %ecx,88(%esp) + movl %edx,92(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,96(%esp) + movl %ebx,100(%esp) + movl %ecx,104(%esp) + movl %edx,108(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + subl $1,%eax + movl %eax,112(%esp) + movl %ebx,116(%esp) + movl %ecx,120(%esp) + movl %edx,124(%esp) + jmp L000entry +.align 4,0x90 +L001outer_loop: + movl %ebx,156(%esp) + movl %eax,152(%esp) + movl %ecx,160(%esp) +L000entry: + movl $1634760805,%eax + movl $857760878,4(%esp) + movl $2036477234,8(%esp) + movl $1797285236,12(%esp) + movl 84(%esp),%ebx + movl 88(%esp),%ebp + movl 104(%esp),%ecx + movl 108(%esp),%esi + movl 116(%esp),%edx + movl 120(%esp),%edi + movl %ebx,20(%esp) + movl %ebp,24(%esp) + movl %ecx,40(%esp) + movl %esi,44(%esp) + movl %edx,52(%esp) + movl %edi,56(%esp) + movl 92(%esp),%ebx + movl 124(%esp),%edi + movl 112(%esp),%edx + movl 80(%esp),%ebp + movl 96(%esp),%ecx + movl 100(%esp),%esi + addl $1,%edx + movl %ebx,28(%esp) + movl %edi,60(%esp) + movl %edx,112(%esp) + movl $10,%ebx + jmp L002loop +.align 4,0x90 +L002loop: + addl %ebp,%eax + movl %ebx,128(%esp) + movl %ebp,%ebx + xorl %eax,%edx + roll $16,%edx + addl %edx,%ecx + xorl %ecx,%ebx + movl 52(%esp),%edi + roll $12,%ebx + movl 20(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,48(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,32(%esp) + roll $16,%edi + movl %ebx,16(%esp) + addl %edi,%esi + movl 40(%esp),%ecx + xorl %esi,%ebp + movl 56(%esp),%edx + roll $12,%ebp + movl 24(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,52(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,36(%esp) + roll $16,%edx + movl %ebp,20(%esp) + addl %edx,%ecx + movl 44(%esp),%esi + xorl %ecx,%ebx + movl 60(%esp),%edi + roll $12,%ebx + movl 28(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,56(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,24(%esp) + addl %edi,%esi + xorl %esi,%ebp + roll $12,%ebp + movl 20(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,%edx + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + roll $16,%edx + movl %ebp,28(%esp) + addl %edx,%ecx + xorl %ecx,%ebx + movl 48(%esp),%edi + roll $12,%ebx + movl 24(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,60(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,40(%esp) + roll $16,%edi + movl %ebx,20(%esp) + addl %edi,%esi + movl 32(%esp),%ecx + xorl %esi,%ebp + movl 52(%esp),%edx + roll $12,%ebp + movl 28(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + 
roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,48(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,44(%esp) + roll $16,%edx + movl %ebp,24(%esp) + addl %edx,%ecx + movl 36(%esp),%esi + xorl %ecx,%ebx + movl 56(%esp),%edi + roll $12,%ebx + movl 16(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,52(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,28(%esp) + addl %edi,%esi + xorl %esi,%ebp + movl 48(%esp),%edx + roll $12,%ebp + movl 128(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,56(%esp) + xorl %esi,%ebp + roll $7,%ebp + decl %ebx + jnz L002loop + movl 160(%esp),%ebx + addl $1634760805,%eax + addl 80(%esp),%ebp + addl 96(%esp),%ecx + addl 100(%esp),%esi + cmpl $64,%ebx + jb L003tail + movl 156(%esp),%ebx + addl 112(%esp),%edx + addl 120(%esp),%edi + xorl (%ebx),%eax + xorl 16(%ebx),%ebp + movl %eax,(%esp) + movl 152(%esp),%eax + xorl 32(%ebx),%ecx + xorl 36(%ebx),%esi + xorl 48(%ebx),%edx + xorl 56(%ebx),%edi + movl %ebp,16(%eax) + movl %ecx,32(%eax) + movl %esi,36(%eax) + movl %edx,48(%eax) + movl %edi,56(%eax) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + xorl 4(%ebx),%ebp + xorl 8(%ebx),%ecx + xorl 12(%ebx),%esi + xorl 20(%ebx),%edx + xorl 24(%ebx),%edi + movl %ebp,4(%eax) + movl %ecx,8(%eax) + movl %esi,12(%eax) + movl %edx,20(%eax) + movl %edi,24(%eax) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + xorl 28(%ebx),%ebp + xorl 40(%ebx),%ecx + xorl 44(%ebx),%esi + xorl 52(%ebx),%edx + xorl 60(%ebx),%edi + leal 64(%ebx),%ebx + movl %ebp,28(%eax) + movl (%esp),%ebp + movl %ecx,40(%eax) + movl 160(%esp),%ecx + movl %esi,44(%eax) + movl %edx,52(%eax) + movl %edi,60(%eax) + movl %ebp,(%eax) + leal 64(%eax),%eax + subl $64,%ecx + jnz L001outer_loop + jmp L004done +L003tail: + addl 112(%esp),%edx + addl 120(%esp),%edi + movl %eax,(%esp) + movl %ebp,16(%esp) + movl %ecx,32(%esp) + movl %esi,36(%esp) + movl %edx,48(%esp) + movl %edi,56(%esp) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + movl %ebp,4(%esp) + movl %ecx,8(%esp) + movl %esi,12(%esp) + movl %edx,20(%esp) + movl %edi,24(%esp) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + movl %ebp,28(%esp) + movl 156(%esp),%ebp + movl %ecx,40(%esp) + movl 152(%esp),%ecx + movl %esi,44(%esp) + xorl %esi,%esi + movl %edx,52(%esp) + movl %edi,60(%esp) + xorl %eax,%eax + xorl %edx,%edx +L005tail_loop: + movb (%esi,%ebp,1),%al + movb (%esp,%esi,1),%dl + leal 1(%esi),%esi + xorb %dl,%al + movb %al,-1(%ecx,%esi,1) + decl %ebx + jnz L005tail_loop +L004done: + addl $132,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _ChaCha20_ctr32_ssse3 +.private_extern _ChaCha20_ctr32_ssse3 +.align 4 +_ChaCha20_ctr32_ssse3: 
+L_ChaCha20_ctr32_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call Lpic_point +Lpic_point: + popl %eax + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal Lssse3_data-Lpic_point(%eax),%eax + movdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb L0061x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + movdqu (%edx),%xmm7 + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + paddd 48(%eax),%xmm0 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + psubd 64(%eax),%xmm0 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,64(%ebp) + movdqa %xmm1,80(%ebp) + movdqa %xmm2,96(%ebp) + movdqa %xmm3,112(%ebp) + movdqu 16(%edx),%xmm3 + movdqa %xmm4,-64(%ebp) + movdqa %xmm5,-48(%ebp) + movdqa %xmm6,-32(%ebp) + movdqa %xmm7,-16(%ebp) + movdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,(%ebp) + movdqa %xmm1,16(%ebp) + movdqa %xmm2,32(%ebp) + movdqa %xmm3,48(%ebp) + movdqa %xmm4,-128(%ebp) + movdqa %xmm5,-112(%ebp) + movdqa %xmm6,-96(%ebp) + movdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp L007outer_loop +.align 4,0x90 +L007outer_loop: + movdqa -112(%ebp),%xmm1 + movdqa -96(%ebp),%xmm2 + movdqa -80(%ebp),%xmm3 + movdqa -48(%ebp),%xmm5 + movdqa -32(%ebp),%xmm6 + movdqa -16(%ebp),%xmm7 + movdqa %xmm1,-112(%ebx) + movdqa %xmm2,-96(%ebx) + movdqa %xmm3,-80(%ebx) + movdqa %xmm5,-48(%ebx) + movdqa %xmm6,-32(%ebx) + movdqa %xmm7,-16(%ebx) + movdqa 32(%ebp),%xmm2 + movdqa 48(%ebp),%xmm3 + movdqa 64(%ebp),%xmm4 + movdqa 80(%ebp),%xmm5 + movdqa 96(%ebp),%xmm6 + movdqa 112(%ebp),%xmm7 + paddd 64(%eax),%xmm4 + movdqa %xmm2,32(%ebx) + movdqa %xmm3,48(%ebx) + movdqa %xmm4,64(%ebx) + movdqa %xmm5,80(%ebx) + movdqa %xmm6,96(%ebx) + movdqa %xmm7,112(%ebx) + movdqa %xmm4,64(%ebp) + movdqa -128(%ebp),%xmm0 + movdqa %xmm4,%xmm6 + movdqa -64(%ebp),%xmm3 + movdqa (%ebp),%xmm4 + movdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 4,0x90 +L008loop: + paddd %xmm3,%xmm0 + movdqa %xmm3,%xmm2 + pxor %xmm0,%xmm6 + pshufb (%eax),%xmm6 + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -48(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 80(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,64(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-64(%ebx) + paddd %xmm7,%xmm5 + movdqa 32(%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -32(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 96(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,80(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,16(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-48(%ebx) + paddd %xmm6,%xmm4 + movdqa 48(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa 
-16(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 112(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,96(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-32(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa -48(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,%xmm6 + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + pshufb (%eax),%xmm6 + movdqa %xmm3,-16(%ebx) + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -32(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 64(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,112(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,32(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-48(%ebx) + paddd %xmm7,%xmm5 + movdqa (%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -16(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 80(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,64(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,48(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-32(%ebx) + paddd %xmm6,%xmm4 + movdqa 16(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -64(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 96(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,80(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-16(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 64(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,96(%ebx) + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + por %xmm1,%xmm3 + decl %edx + jnz L008loop + movdqa %xmm3,-64(%ebx) + movdqa %xmm4,(%ebx) + movdqa %xmm5,16(%ebx) + movdqa %xmm6,64(%ebx) + movdqa %xmm7,96(%ebx) + movdqa -112(%ebx),%xmm1 + movdqa -96(%ebx),%xmm2 + movdqa -80(%ebx),%xmm3 + paddd -128(%ebp),%xmm0 + paddd -112(%ebp),%xmm1 + paddd -96(%ebp),%xmm2 + paddd -80(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu 
-64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa -64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa -48(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa -32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa -16(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd -64(%ebp),%xmm0 + paddd -48(%ebp),%xmm1 + paddd -32(%ebp),%xmm2 + paddd -16(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa (%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 16(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 48(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd (%ebp),%xmm0 + paddd 16(%ebp),%xmm1 + paddd 32(%ebp),%xmm2 + paddd 48(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa 64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 80(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 96(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 112(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd 64(%ebp),%xmm0 + paddd 80(%ebp),%xmm1 + paddd 96(%ebp),%xmm2 + paddd 112(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 208(%esi),%esi + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm5 + pxor %xmm2,%xmm6 + pxor %xmm3,%xmm7 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc L007outer_loop + addl $256,%ecx + jz L009done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + movd 64(%ebp),%xmm2 + movdqu (%ebx),%xmm3 + paddd 96(%eax),%xmm2 + pand 112(%eax),%xmm3 + por %xmm2,%xmm3 +L0061x: + movdqa 32(%eax),%xmm0 + movdqu (%edx),%xmm1 + movdqu 16(%edx),%xmm2 + movdqa (%eax),%xmm6 + movdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + movl $10,%edx + jmp L010loop1x +.align 4,0x90 +L011outer1x: + movdqa 80(%eax),%xmm3 + movdqa (%esp),%xmm0 + movdqa 16(%esp),%xmm1 + movdqa 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + movl $10,%edx + movdqa %xmm3,48(%esp) + jmp L010loop1x +.align 4,0x90 +L010loop1x: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 
+ por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %edx + jnz L010loop1x + paddd (%esp),%xmm0 + paddd 16(%esp),%xmm1 + paddd 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + cmpl $64,%ecx + jb L012tail + movdqu (%esi),%xmm4 + movdqu 16(%esi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%esi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%esi),%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + leal 64(%esi),%esi + movdqu %xmm0,(%edi) + movdqu %xmm1,16(%edi) + movdqu %xmm2,32(%edi) + movdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz L011outer1x + jmp L009done +L012tail: + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +L013tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz L013tail_loop +L009done: + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +Lssse3_data: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.long 1634760805,857760878,2036477234,1797285236 +.long 0,1,2,3 +.long 4,4,4,4 +.long 1,0,0,0 +.long 4,0,0,0 +.long 0,-1,-1,-1 +.align 6,0x90 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +.byte 114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-linux.S index eb940347..73a49e6e 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
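The x86 file above loads the state with the constants 1634760805, 857760878, 2036477234, 1797285236 (the "expand 32-byte k" sigma words, also spelled out in its trailing .byte string) and runs the round loop ten times (movl $10 ... decl/jnz), i.e. ten column-plus-diagonal double rounds. A C sketch of that block structure, reusing chacha_quarter_round from the sketch above (again illustrative, not the generated code):

/* One ChaCha20 block: the 4x4 state is sigma, eight key words, the block
 * counter and three nonce words; after ten double rounds the original
 * input state is added back in. Requires chacha_quarter_round above. */
static void chacha20_block(uint32_t out[16], const uint32_t in[16]) {
  uint32_t x[16];
  for (int i = 0; i < 16; i++) x[i] = in[i];
  for (int i = 0; i < 10; i++) {                         /* 20 rounds total */
    chacha_quarter_round(&x[0], &x[4], &x[8],  &x[12]);  /* columns */
    chacha_quarter_round(&x[1], &x[5], &x[9],  &x[13]);
    chacha_quarter_round(&x[2], &x[6], &x[10], &x[14]);
    chacha_quarter_round(&x[3], &x[7], &x[11], &x[15]);
    chacha_quarter_round(&x[0], &x[5], &x[10], &x[15]);  /* diagonals */
    chacha_quarter_round(&x[1], &x[6], &x[11], &x[12]);
    chacha_quarter_round(&x[2], &x[7], &x[8],  &x[13]);
    chacha_quarter_round(&x[3], &x[4], &x[9],  &x[14]);
  }
  for (int i = 0; i < 16; i++) out[i] = x[i] + in[i];
}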
@@ -961,7 +960,6 @@ ChaCha20_ctr32_ssse3: .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-apple.S index 6a51e976..0a1f1b48 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -461,7 +460,6 @@ _ChaCha20_ctr32_ssse3_4x: _CET_ENDBR movq %rsp,%r9 - movq %r10,%r11 subq $0x140+8,%rsp movdqa L$sigma(%rip),%xmm11 movdqu (%rcx),%xmm15 @@ -1604,7 +1602,6 @@ L$8x_epilogue: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-linux.S index 7a4759c5..dee0a994 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -467,7 +466,6 @@ ChaCha20_ctr32_ssse3_4x: _CET_ENDBR movq %rsp,%r9 .cfi_def_cfa_register r9 - movq %r10,%r11 subq $0x140+8,%rsp movdqa .Lsigma(%rip),%xmm11 movdqu (%rcx),%xmm15 @@ -1610,7 +1608,6 @@ _CET_ENDBR .cfi_endproc .size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-apple.S index c39f8c08..ac2500d7 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
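The renames in these hunks all follow one pattern: the pregenerated assembly moves from per-platform filenames under crypto/ (e.g. chacha-x86-linux.linux.x86.S) to gen/crypto/ (chacha-x86-linux.S), and the hand-added outer guard such as #if defined(__i386__) && defined(__linux__) is dropped, leaving only the guard emitted by the generator. Schematically (not the literal file contents):

/* Before: crypto/chacha/chacha-x86-linux.linux.x86.S
 *   #if defined(__i386__) && defined(__linux__)
 *   #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
 *   ...generated assembly...
 *   #endif
 *   #endif
 * After: gen/crypto/chacha-x86-linux.S keeps only the inner guard. */
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
/* ...generated assembly... */
#endif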
@@ -3009,7 +3008,6 @@ Lopen_128_hash_64: .cfi_endproc #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-linux.S index 45387193..30ef5ce4 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -3009,7 +3008,6 @@ chacha20_poly1305_open: .cfi_endproc .size chacha20_poly1305_open,.-chacha20_poly1305_open #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-win.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-win.S new file mode 100644 index 00000000..e8952e59 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-win.S @@ -0,0 +1,3020 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
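The new Windows AArch64 ChaCha20-Poly1305 file below opens with the shared constant tables: Lchacha20_consts ("expand 32-byte k"), Linc, the Lrol8 byte shuffle used for the rotate-by-8 step, and Lclamp, the mask applied to the Poly1305 r key. As in RFC 8439, the one-time Poly1305 key pair (r, s) is taken from the first ChaCha20 key-stream block and r is clamped with exactly the two quadwords in Lclamp. A minimal sketch, with hypothetical helper and parameter names (block0 stands for the first 64-byte key-stream block; assumes a little-endian host, as on AArch64):

#include <stdint.h>
#include <string.h>

/* Derive the one-time Poly1305 key halves from the first key-stream block:
 * r = bytes 0..15 masked by Lclamp, s = bytes 16..31. */
static void poly1305_key_from_block0(const uint8_t block0[64],
                                     uint64_t r[2], uint64_t s[2]) {
  memcpy(r, block0, 16);
  memcpy(s, block0 + 16, 16);
  r[0] &= 0x0FFFFFFC0FFFFFFFull;  /* the two quadwords of Lclamp */
  r[1] &= 0x0FFFFFFC0FFFFFFCull;
}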
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include +.section .rodata + +.align 7 +Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +Linc: +.long 1,2,3,4 +Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.def Lpoly_hash_ad_internal + .type 32 +.endef +.align 6 +Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, Lpoly_hash_intro + ret + +Lpoly_hash_intro: + cmp x4, #16 + b.lt Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lpoly_hash_ad_internal + +Lpoly_hash_ad_tail: + cbz x4, Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lpoly_hash_ad_ret: + ret +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal + +.def chacha20_poly1305_seal + .type 32 +.endef +.align 6 +chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
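The mul/umulh ladder repeated throughout Lpoly_hash_ad_internal (and later interleaved with the ChaCha20 rounds in the seal path) is one Poly1305 block update: a 16-byte block plus its padding bit is added into the 130-bit accumulator held in three 64-bit limbs, multiplied by the clamped r, and partially reduced modulo 2^130 - 5 using 2^130 ≡ 5. A scalar C sketch of the same arithmetic, with illustrative names rather than BoringSSL's API, using the compiler's 128-bit integer where the assembly uses mul/umulh pairs:

#include <stdint.h>

typedef unsigned __int128 u128;

/* One Poly1305 block: acc = ((acc + m + pad*2^128) * r) mod 2^130 - 5.
 * acc[0..2] are the accumulator limbs (acc[2] stays small), r[1]:r[0] is
 * the clamped key half. Mirrors the adds/adcs accumulate, the mul/umulh
 * multiply, and the "fold the top bits times 5 back in" reduction. */
static void poly1305_block(uint64_t acc[3], const uint64_t r[2],
                           uint64_t m0, uint64_t m1, uint64_t pad) {
  /* acc += block || pad bit */
  u128 a0 = (u128)acc[0] + m0;
  u128 a1 = (u128)acc[1] + m1 + (uint64_t)(a0 >> 64);
  uint64_t h0 = (uint64_t)a0, h1 = (uint64_t)a1;
  uint64_t h2 = acc[2] + pad + (uint64_t)(a1 >> 64);

  /* [t3:t2:t1:t0] = [h2:h1:h0] * [r1:r0] (requires r clamped) */
  u128 d0 = (u128)h0 * r[0];
  u128 d1 = (u128)h1 * r[0] + (u128)h0 * r[1] + (uint64_t)(d0 >> 64);
  u128 d2 = (u128)h2 * r[0] + (u128)h1 * r[1] + (uint64_t)(d1 >> 64);
  uint64_t t0 = (uint64_t)d0, t1 = (uint64_t)d1, t2 = (uint64_t)d2;
  uint64_t t3 = h2 * r[1] + (uint64_t)(d2 >> 64);

  /* Fold bits >= 2^130 back in: result = low 130 bits + 5*high, 5 = 4 + 1. */
  uint64_t c_lo = (t2 >> 2) | (t3 << 62);
  uint64_t c_hi = t3 >> 2;
  u128 e0 = (u128)t0 + (t2 & ~(uint64_t)3) + c_lo;
  u128 e1 = (u128)t1 + t3 + c_hi + (uint64_t)(e0 >> 64);
  acc[0] = (uint64_t)e0;
  acc[1] = (uint64_t)e1;
  acc[2] = (t2 & 3) + (uint64_t)(e1 >> 64);   /* stays a few bits at most */
}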
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + 
add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, 
v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +Lseal_main_loop: + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 +Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor 
v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = 
[acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, 
v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b Lseal_main_loop + +Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. + cmp x2, #64 + b.lt Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, 
x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b Lseal_tail + +Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b Lseal_tail_64 + +Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_hash_extra: + cbz x4, Lseal_finalize + +Lseal_hash_extra_loop: + cmp x4, #16 + b.lt Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lseal_hash_extra_loop + +Lseal_hash_extra_tail: + cbz x4, Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli 
v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + b Lseal_tail +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open + +.def chacha20_poly1305_open + .type 32 +.endef +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! 
+.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_ad_done: + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +Lopen_main_loop: + + cmp x2, #192 + b.lt Lopen_tail + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, 
Lopen_main_loop_rounds_short + +.align 5 +Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 
is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, 
v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b Lopen_main_loop + +Lopen_tail: + + cbz x2, Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le Lopen_tail_64 + cmp x2, #128 + b.le Lopen_tail_128 + +Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, Lopen_tail_192_rounds_no_hash + +Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul 
x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt Lopen_tail_192_rounds + subs x6, x6, #1 
+ b.ge Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +Lopen_tail_192_hash: + cbz x4, Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_tail_192_hash + +Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b Lopen_tail_64_store + +Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, 
v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_128_rounds + cbz x4, Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_128_rounds + +Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b Lopen_tail_64_store + +Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, 
v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_64_rounds + cbz x4, Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_64_rounds + +Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +Lopen_tail_64_store: + cmp x2, #16 + b.lt Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b Lopen_tail_64_store + +Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lopen_tail_16_store + +Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add 
v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_128_store: + cmp x2, #64 + b.lt Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 
+ adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + 
+Lopen_128_hash_64: + cbz x4, Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_128_hash_64 +.cfi_endproc + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-apple.S index b4ff5f05..7ca074e7 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-apple.S @@ -1,18 +1,13 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) -.text - - -chacha20_poly1305_constants: - .section __DATA,__const .p2align 6 +chacha20_poly1305_constants: L$chacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' @@ -218,11 +213,11 @@ L$hash_ad_done: -.globl _chacha20_poly1305_open -.private_extern _chacha20_poly1305_open +.globl _chacha20_poly1305_open_nohw +.private_extern _chacha20_poly1305_open_nohw .p2align 6 -_chacha20_poly1305_open: +_chacha20_poly1305_open_nohw: _CET_ENDBR pushq %rbp @@ -251,11 +246,6 @@ _CET_ENDBR movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) - movl _OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_open_avx2 - cmpq $128,%rbx jbe L$open_sse_128 @@ -2090,11 +2080,11 @@ L$open_sse_128_xor_hash: -.globl _chacha20_poly1305_seal -.private_extern _chacha20_poly1305_seal +.globl _chacha20_poly1305_seal_nohw +.private_extern _chacha20_poly1305_seal_nohw .p2align 6 -_chacha20_poly1305_seal: +_chacha20_poly1305_seal_nohw: _CET_ENDBR pushq %rbp @@ -2124,11 +2114,6 @@ _CET_ENDBR movq %rbx,8+0+32(%rbp) movq %rdx,%rbx - movl _OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_seal_avx2 - cmpq $128,%rbx jbe L$seal_sse_128 @@ -4077,20 +4062,38 @@ L$seal_sse_128_rounds: +.globl _chacha20_poly1305_open_avx2 +.private_extern _chacha20_poly1305_open_avx2 .p2align 6 -chacha20_poly1305_open_avx2: +_chacha20_poly1305_open_avx2: +_CET_ENDBR + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + + pushq %r15 + pushq %r9 + subq $288 + 0 + 32,%rsp + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) vzeroupper vmovdqa L$chacha20_consts(%rip),%ymm0 @@ -6225,20 +6228,39 @@ L$open_avx2_320_rounds: +.globl _chacha20_poly1305_seal_avx2 +.private_extern _chacha20_poly1305_seal_avx2 .p2align 6 -chacha20_poly1305_seal_avx2: +_chacha20_poly1305_seal_avx2: + +_CET_ENDBR + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %r9 + subq $288 + 0 + 32,%rsp + leaq 32(%rsp),%rbp + andq $-32,%rbp + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx vzeroupper vmovdqa L$chacha20_consts(%rip),%ymm0 @@ -8875,7 +8897,6 @@ L$seal_avx2_exit: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-linux.S index 3f3c641d..d80e89dd 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-linux.S @@ -1,19 +1,13 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -chacha20_poly1305_constants: - .section .rodata .align 64 +chacha20_poly1305_constants: .Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' @@ -219,11 +213,11 @@ poly_hash_ad_internal: .cfi_endproc .size poly_hash_ad_internal, .-poly_hash_ad_internal -.globl chacha20_poly1305_open -.hidden chacha20_poly1305_open -.type chacha20_poly1305_open,@function +.globl chacha20_poly1305_open_nohw +.hidden chacha20_poly1305_open_nohw +.type chacha20_poly1305_open_nohw,@function .align 64 -chacha20_poly1305_open: +chacha20_poly1305_open_nohw: .cfi_startproc _CET_ENDBR pushq %rbp @@ -259,11 +253,6 @@ _CET_ENDBR movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_open_avx2 - cmpq $128,%rbx jbe .Lopen_sse_128 @@ -2096,7 +2085,7 @@ _CET_ENDBR movdqa %xmm10,%xmm6 movdqa %xmm14,%xmm10 jmp .Lopen_sse_128_xor_hash -.size chacha20_poly1305_open, .-chacha20_poly1305_open +.size chacha20_poly1305_open_nohw, .-chacha20_poly1305_open_nohw .cfi_endproc @@ -2105,11 +2094,11 @@ _CET_ENDBR -.globl chacha20_poly1305_seal -.hidden chacha20_poly1305_seal -.type chacha20_poly1305_seal,@function +.globl chacha20_poly1305_seal_nohw +.hidden chacha20_poly1305_seal_nohw +.type chacha20_poly1305_seal_nohw,@function .align 64 -chacha20_poly1305_seal: +chacha20_poly1305_seal_nohw: .cfi_startproc _CET_ENDBR pushq %rbp @@ -2146,11 +2135,6 @@ _CET_ENDBR movq %rbx,8+0+32(%rbp) movq %rdx,%rbx - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_seal_avx2 - cmpq $128,%rbx jbe .Lseal_sse_128 @@ -4102,32 +4086,50 @@ process_extra_in_trailer: movq %r8,%r8 call poly_hash_ad_internal jmp .Lseal_sse_128_tail_xor -.size chacha20_poly1305_seal, .-chacha20_poly1305_seal +.size chacha20_poly1305_seal_nohw, .-chacha20_poly1305_seal_nohw .cfi_endproc +.globl chacha20_poly1305_open_avx2 +.hidden chacha20_poly1305_open_avx2 .type chacha20_poly1305_open_avx2,@function .align 64 chacha20_poly1305_open_avx2: .cfi_startproc - - +_CET_ENDBR + pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 + pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 + pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 + pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 + pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 + + + pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + vzeroupper vmovdqa .Lchacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 @@ -6261,27 +6263,46 @@ chacha20_poly1305_open_avx2: .cfi_endproc +.globl chacha20_poly1305_seal_avx2 +.hidden chacha20_poly1305_seal_avx2 .type chacha20_poly1305_seal_avx2,@function .align 64 chacha20_poly1305_seal_avx2: .cfi_startproc - - +_CET_ENDBR + pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 + pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 + pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 + pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 + pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset 
%r15,-56 + + + pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx vzeroupper vmovdqa .Lchacha20_consts(%rip),%ymm0 @@ -8918,7 +8939,6 @@ chacha20_poly1305_seal_avx2: .cfi_endproc .size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/err/err_data.c b/Sources/CCryptoBoringSSL/gen/crypto/err_data.c similarity index 78% rename from Sources/CCryptoBoringSSL/crypto/err/err_data.c rename to Sources/CCryptoBoringSSL/gen/crypto/err_data.c index 1068c5f6..d7612384 100644 --- a/Sources/CCryptoBoringSSL/crypto/err/err_data.c +++ b/Sources/CCryptoBoringSSL/gen/crypto/err_data.c @@ -12,7 +12,7 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - /* This file was generated by err_data_generate.go. */ + /* This file was generated by go run ./util/pregenerate. */ #include #include @@ -76,54 +76,54 @@ const uint32_t kOpenSSLReasonValues[] = { 0xc3b00f7, 0xc3b8921, 0x10320892, - 0x10329641, - 0x1033164d, - 0x10339666, - 0x10341679, + 0x10329654, + 0x10331660, + 0x10339679, + 0x1034168c, 0x10348f93, 0x10350cdf, - 0x1035968c, - 0x103616b6, - 0x103696c9, - 0x103716e8, - 0x10379701, - 0x10381716, - 0x10389734, - 0x10391743, - 0x1039975f, - 0x103a177a, - 0x103a9789, - 0x103b17a5, - 0x103b97c0, - 0x103c17e6, + 0x1035969f, + 0x103616c9, + 0x103696dc, + 0x103716fb, + 0x10379714, + 0x10381729, + 0x10389747, + 0x10391756, + 0x10399772, + 0x103a178d, + 0x103a979c, + 0x103b17b8, + 0x103b97d3, + 0x103c17f9, 0x103c80f7, - 0x103d17f7, - 0x103d980b, - 0x103e182a, - 0x103e9839, - 0x103f1850, - 0x103f9863, + 0x103d180a, + 0x103d981e, + 0x103e183d, + 0x103e984c, + 0x103f1863, + 0x103f9876, 0x10400ca3, - 0x10409876, - 0x10411894, - 0x104198a7, - 0x104218c1, - 0x104298d1, - 0x104318e5, - 0x104398fb, - 0x10441913, - 0x10449928, - 0x1045193c, - 0x1045994e, + 0x10409889, + 0x104118a7, + 0x104198ba, + 0x104218d4, + 0x104298e4, + 0x104318f8, + 0x1043990e, + 0x10441926, + 0x1044993b, + 0x1045194f, + 0x10459961, 0x10460635, 0x1046899a, - 0x10471963, - 0x1047997a, - 0x1048198f, - 0x1048999d, + 0x10471976, + 0x1047998d, + 0x104819a2, + 0x104899b0, 0x10490edf, - 0x104997d7, - 0x104a16a1, + 0x104997ea, + 0x104a16b4, 0x14320c73, 0x14328c94, 0x14330ca3, @@ -139,53 +139,54 @@ const uint32_t kOpenSSLReasonValues[] = { 0x183480f7, 0x18351032, 0x1835904a, - 0x1836105f, - 0x18369073, - 0x183710ab, - 0x183790c1, - 0x183810d5, - 0x183890e5, + 0x18361072, + 0x18369086, + 0x183710be, + 0x183790d4, + 0x183810e8, + 0x183890f8, 0x18390ac0, - 0x183990f5, - 0x183a111b, - 0x183a9141, + 0x18399108, + 0x183a112e, + 0x183a9154, 0x183b0ceb, - 0x183b9190, - 0x183c11a2, - 0x183c91ad, - 0x183d11bd, - 0x183d91ce, - 0x183e11df, - 0x183e91f1, - 0x183f121a, - 0x183f9233, - 0x1840124b, + 0x183b91a3, + 0x183c11b5, + 0x183c91c0, + 0x183d11d0, + 0x183d91e1, + 0x183e11f2, + 0x183e9204, + 0x183f122d, + 0x183f9246, + 0x1840125e, 0x1840870d, - 0x18411164, - 0x1841912f, - 0x1842114e, + 0x18411177, + 0x18419142, + 0x18421161, 0x18428c81, - 0x1843110a, - 0x18439176, + 0x1843111d, + 0x18439189, 0x18441028, - 0x18449097, - 0x20321285, - 0x20329272, - 0x24321291, + 0x184490aa, + 0x1845105f, + 
0x20321298, + 0x20329285, + 0x243212a4, 0x243289e0, - 0x243312a3, - 0x243392b0, - 0x243412bd, - 0x243492cf, - 0x243512de, - 0x243592fb, - 0x24361308, - 0x24369316, - 0x24371324, - 0x24379332, - 0x2438133b, - 0x24389348, - 0x2439135b, + 0x243312b6, + 0x243392c3, + 0x243412d0, + 0x243492e2, + 0x243512f1, + 0x2435930e, + 0x2436131b, + 0x24369329, + 0x24371337, + 0x24379345, + 0x2438134e, + 0x2438935b, + 0x2439136e, 0x28320cd3, 0x28328ceb, 0x28330ca3, @@ -195,51 +196,51 @@ const uint32_t kOpenSSLReasonValues[] = { 0x283500f7, 0x28358c81, 0x2836099a, - 0x2c3232e7, - 0x2c329372, - 0x2c3332f5, - 0x2c33b307, - 0x2c34331b, - 0x2c34b32d, - 0x2c353348, - 0x2c35b35a, - 0x2c36338a, + 0x2c3232fa, + 0x2c329385, + 0x2c333308, + 0x2c33b31a, + 0x2c34332e, + 0x2c34b340, + 0x2c35335b, + 0x2c35b36d, + 0x2c36339d, 0x2c36833a, - 0x2c373397, - 0x2c37b3c3, - 0x2c383401, - 0x2c38b418, - 0x2c393436, - 0x2c39b446, - 0x2c3a3458, - 0x2c3ab46c, - 0x2c3b347d, - 0x2c3bb49c, - 0x2c3c1384, - 0x2c3c939a, - 0x2c3d34e1, - 0x2c3d93b3, - 0x2c3e350b, - 0x2c3eb519, - 0x2c3f3531, - 0x2c3fb549, - 0x2c403573, - 0x2c409285, - 0x2c413584, - 0x2c41b597, - 0x2c42124b, - 0x2c42b5a8, + 0x2c3733aa, + 0x2c37b3d6, + 0x2c383414, + 0x2c38b42b, + 0x2c393449, + 0x2c39b459, + 0x2c3a346b, + 0x2c3ab47f, + 0x2c3b3490, + 0x2c3bb4af, + 0x2c3c1397, + 0x2c3c93ad, + 0x2c3d34f4, + 0x2c3d93c6, + 0x2c3e351e, + 0x2c3eb52c, + 0x2c3f3544, + 0x2c3fb55c, + 0x2c403586, + 0x2c409298, + 0x2c413597, + 0x2c41b5aa, + 0x2c42125e, + 0x2c42b5bb, 0x2c43076d, - 0x2c43b48e, - 0x2c4433d6, - 0x2c44b556, - 0x2c45336d, - 0x2c45b3a9, - 0x2c463426, - 0x2c46b4b0, - 0x2c4734c5, - 0x2c47b4fe, - 0x2c4833e8, + 0x2c43b4a1, + 0x2c4433e9, + 0x2c44b569, + 0x2c453380, + 0x2c45b3bc, + 0x2c463439, + 0x2c46b4c3, + 0x2c4734d8, + 0x2c47b511, + 0x2c4833fb, 0x30320000, 0x30328015, 0x3033001f, @@ -379,261 +380,261 @@ const uint32_t kOpenSSLReasonValues[] = { 0x3c418dd3, 0x3c420edf, 0x3c428e69, - 0x40321a2f, - 0x40329a45, - 0x40331a73, - 0x40339a7d, - 0x40341a94, - 0x40349ab2, - 0x40351ac2, - 0x40359ad4, - 0x40361ae1, - 0x40369aed, - 0x40371b02, - 0x40379b14, - 0x40381b1f, - 0x40389b31, + 0x40321a42, + 0x40329a58, + 0x40331a86, + 0x40339a90, + 0x40341aa7, + 0x40349ac5, + 0x40351ad5, + 0x40359ae7, + 0x40361af4, + 0x40369b00, + 0x40371b15, + 0x40379b27, + 0x40381b32, + 0x40389b44, 0x40390f93, - 0x40399b41, - 0x403a1b54, - 0x403a9b75, - 0x403b1b86, - 0x403b9b96, + 0x40399b54, + 0x403a1b67, + 0x403a9b88, + 0x403b1b99, + 0x403b9ba9, 0x403c0071, 0x403c8090, - 0x403d1bf7, - 0x403d9c0d, - 0x403e1c1c, - 0x403e9c54, - 0x403f1c6e, - 0x403f9c96, - 0x40401cab, - 0x40409cbf, - 0x40411cfa, - 0x40419d15, - 0x40421d2e, - 0x40429d41, - 0x40431d55, - 0x40439d83, - 0x40441d9a, + 0x403d1c0a, + 0x403d9c20, + 0x403e1c2f, + 0x403e9c67, + 0x403f1c81, + 0x403f9ca9, + 0x40401cbe, + 0x40409cd2, + 0x40411d0d, + 0x40419d28, + 0x40421d41, + 0x40429d54, + 0x40431d68, + 0x40439d96, + 0x40441dad, 0x404480b9, - 0x40451daf, - 0x40459dc1, - 0x40461de5, - 0x40469e05, - 0x40471e13, - 0x40479e3a, - 0x40481eab, - 0x40489f65, - 0x40491f7c, - 0x40499f96, - 0x404a1fad, - 0x404a9fcb, - 0x404b1fe3, - 0x404ba010, - 0x404c2026, - 0x404ca038, - 0x404d2059, - 0x404da092, - 0x404e20a6, - 0x404ea0b3, - 0x404f2164, - 0x404fa1da, - 0x40502249, - 0x4050a25d, - 0x40512290, - 0x405222a0, - 0x4052a2c4, - 0x405322dc, - 0x4053a2ef, - 0x40542304, - 0x4054a327, - 0x40552352, - 0x4055a38f, - 0x405623b4, - 0x4056a3cd, - 0x405723e5, - 0x4057a3f8, - 0x4058240d, - 0x4058a434, - 0x40592463, - 0x4059a490, - 0x405aa4a4, - 0x405b24bc, - 0x405ba4cd, - 0x405c24e0, - 
0x405ca51f, - 0x405d252c, - 0x405da551, - 0x405e258f, + 0x40451dc2, + 0x40459dd4, + 0x40461df8, + 0x40469e18, + 0x40471e26, + 0x40479e4d, + 0x40481ebe, + 0x40489f78, + 0x40491f8f, + 0x40499fa9, + 0x404a1fc0, + 0x404a9fde, + 0x404b1ff6, + 0x404ba023, + 0x404c2039, + 0x404ca04b, + 0x404d206c, + 0x404da0a5, + 0x404e20b9, + 0x404ea0c6, + 0x404f2177, + 0x404fa1ed, + 0x4050225c, + 0x4050a270, + 0x405122a3, + 0x405222b3, + 0x4052a2d7, + 0x405322ef, + 0x4053a302, + 0x40542317, + 0x4054a33a, + 0x40552365, + 0x4055a3a2, + 0x405623c7, + 0x4056a3e0, + 0x405723f8, + 0x4057a40b, + 0x40582420, + 0x4058a447, + 0x40592476, + 0x4059a4a3, + 0x405aa4b7, + 0x405b24cf, + 0x405ba4e0, + 0x405c24f3, + 0x405ca532, + 0x405d253f, + 0x405da564, + 0x405e25a2, 0x405e8afe, - 0x405f25b0, - 0x405fa5bd, - 0x406025cb, - 0x4060a5ed, - 0x4061264e, - 0x4061a686, - 0x4062269d, - 0x4062a6ae, - 0x406326fb, - 0x4063a710, - 0x40642727, - 0x4064a753, - 0x4065276e, - 0x4065a785, - 0x4066279d, - 0x4066a7c7, - 0x406727f2, - 0x4067a837, - 0x4068287f, - 0x4068a8a0, - 0x406928d2, - 0x4069a900, - 0x406a2921, - 0x406aa941, - 0x406b2ac9, - 0x406baaec, - 0x406c2b02, - 0x406cae0c, - 0x406d2e3b, - 0x406dae63, - 0x406e2e91, - 0x406eaede, - 0x406f2f37, - 0x406faf6f, - 0x40702f82, - 0x4070af9f, + 0x405f25c3, + 0x405fa5d0, + 0x406025de, + 0x4060a600, + 0x40612661, + 0x4061a699, + 0x406226b0, + 0x4062a6c1, + 0x4063270e, + 0x4063a723, + 0x4064273a, + 0x4064a766, + 0x40652781, + 0x4065a798, + 0x406627b0, + 0x4066a7da, + 0x40672805, + 0x4067a84a, + 0x40682892, + 0x4068a8b3, + 0x406928e5, + 0x4069a913, + 0x406a2934, + 0x406aa954, + 0x406b2adc, + 0x406baaff, + 0x406c2b15, + 0x406cae1f, + 0x406d2e4e, + 0x406dae76, + 0x406e2ea4, + 0x406eaef1, + 0x406f2f4a, + 0x406faf82, + 0x40702f95, + 0x4070afb2, 0x4071084d, - 0x4071afb1, - 0x40722fc4, - 0x4072affa, - 0x40733012, - 0x4073959c, - 0x40743026, - 0x4074b040, - 0x40753051, - 0x4075b065, - 0x40763073, - 0x40769348, - 0x40773098, - 0x4077b0d8, - 0x407830f3, - 0x4078b12c, - 0x40793143, - 0x4079b159, - 0x407a3185, - 0x407ab198, - 0x407b31ad, - 0x407bb1bf, - 0x407c31f0, - 0x407cb1f9, - 0x407d28bb, - 0x407da202, - 0x407e3108, - 0x407ea444, - 0x407f1e27, - 0x407f9ffa, - 0x40802174, - 0x40809e4f, - 0x408122b2, - 0x4081a101, - 0x40822e7c, - 0x40829ba2, - 0x4083241f, - 0x4083a738, - 0x40841e63, - 0x4084a47c, - 0x408524f1, - 0x4085a615, - 0x40862571, - 0x4086a21c, - 0x40872ec2, - 0x4087a663, - 0x40881be0, - 0x4088a84a, - 0x40891c2f, - 0x40899bbc, - 0x408a2b3a, - 0x408a99b4, - 0x408b31d4, - 0x408baf4c, - 0x408c2501, - 0x408c99ec, - 0x408d1f4b, - 0x408d9e95, - 0x408e207b, - 0x408ea36f, - 0x408f285e, - 0x408fa631, - 0x40902813, - 0x4090a543, - 0x40912b22, - 0x40919a12, - 0x40921c7c, - 0x4092aefd, - 0x40932fdd, - 0x4093a22d, - 0x40941e77, - 0x4094ab53, - 0x409526bf, - 0x4095b165, - 0x40962ea9, - 0x4096a18d, - 0x40972278, - 0x4097a0ca, - 0x40981cdc, - 0x4098a6d3, - 0x40992f19, - 0x4099a39c, - 0x409a2335, - 0x409a99d0, - 0x409b1ed1, - 0x409b9efc, - 0x409c30ba, - 0x409c9f24, - 0x409d2149, - 0x409da117, - 0x409e1d6d, - 0x409ea1c2, - 0x409f21aa, - 0x409f9ec4, - 0x40a021ea, - 0x40a0a0e4, - 0x40a12132, - 0x41f429f4, - 0x41f92a86, - 0x41fe2979, - 0x41feac2f, - 0x41ff2d5d, - 0x42032a0d, - 0x42082a2f, - 0x4208aa6b, - 0x4209295d, - 0x4209aaa5, - 0x420a29b4, - 0x420aa994, - 0x420b29d4, - 0x420baa4d, - 0x420c2d79, - 0x420cab63, - 0x420d2c16, - 0x420dac4d, - 0x42122c80, - 0x42172d40, - 0x4217acc2, - 0x421c2ce4, - 0x421f2c9f, - 0x42212df1, - 0x42262d23, - 0x422b2dcf, - 0x422babf1, - 0x422c2db1, - 0x422caba4, - 0x422d2b7d, - 0x422dad90, - 
0x422e2bd0, - 0x42302cff, - 0x4230ac67, + 0x4071afc4, + 0x40722fd7, + 0x4072b00d, + 0x40733025, + 0x407395af, + 0x40743039, + 0x4074b053, + 0x40753064, + 0x4075b078, + 0x40763086, + 0x4076935b, + 0x407730ab, + 0x4077b0eb, + 0x40783106, + 0x4078b13f, + 0x40793156, + 0x4079b16c, + 0x407a3198, + 0x407ab1ab, + 0x407b31c0, + 0x407bb1d2, + 0x407c3203, + 0x407cb20c, + 0x407d28ce, + 0x407da215, + 0x407e311b, + 0x407ea457, + 0x407f1e3a, + 0x407fa00d, + 0x40802187, + 0x40809e62, + 0x408122c5, + 0x4081a114, + 0x40822e8f, + 0x40829bb5, + 0x40832432, + 0x4083a74b, + 0x40841e76, + 0x4084a48f, + 0x40852504, + 0x4085a628, + 0x40862584, + 0x4086a22f, + 0x40872ed5, + 0x4087a676, + 0x40881bf3, + 0x4088a85d, + 0x40891c42, + 0x40899bcf, + 0x408a2b4d, + 0x408a99c7, + 0x408b31e7, + 0x408baf5f, + 0x408c2514, + 0x408c99ff, + 0x408d1f5e, + 0x408d9ea8, + 0x408e208e, + 0x408ea382, + 0x408f2871, + 0x408fa644, + 0x40902826, + 0x4090a556, + 0x40912b35, + 0x40919a25, + 0x40921c8f, + 0x4092af10, + 0x40932ff0, + 0x4093a240, + 0x40941e8a, + 0x4094ab66, + 0x409526d2, + 0x4095b178, + 0x40962ebc, + 0x4096a1a0, + 0x4097228b, + 0x4097a0dd, + 0x40981cef, + 0x4098a6e6, + 0x40992f2c, + 0x4099a3af, + 0x409a2348, + 0x409a99e3, + 0x409b1ee4, + 0x409b9f0f, + 0x409c30cd, + 0x409c9f37, + 0x409d215c, + 0x409da12a, + 0x409e1d80, + 0x409ea1d5, + 0x409f21bd, + 0x409f9ed7, + 0x40a021fd, + 0x40a0a0f7, + 0x40a12145, + 0x41f42a07, + 0x41f92a99, + 0x41fe298c, + 0x41feac42, + 0x41ff2d70, + 0x42032a20, + 0x42082a42, + 0x4208aa7e, + 0x42092970, + 0x4209aab8, + 0x420a29c7, + 0x420aa9a7, + 0x420b29e7, + 0x420baa60, + 0x420c2d8c, + 0x420cab76, + 0x420d2c29, + 0x420dac60, + 0x42122c93, + 0x42172d53, + 0x4217acd5, + 0x421c2cf7, + 0x421f2cb2, + 0x42212e04, + 0x42262d36, + 0x422b2de2, + 0x422bac04, + 0x422c2dc4, + 0x422cabb7, + 0x422d2b90, + 0x422dada3, + 0x422e2be3, + 0x42302d12, + 0x4230ac7a, 0x44320778, 0x44328787, 0x44330793, @@ -651,109 +652,109 @@ const uint32_t kOpenSSLReasonValues[] = { 0x4439084d, 0x4439885b, 0x443a086e, - 0x48321372, - 0x48329384, - 0x4833139a, - 0x483393b3, - 0x4c3213f0, - 0x4c329400, - 0x4c331413, - 0x4c339433, + 0x48321385, + 0x48329397, + 0x483313ad, + 0x483393c6, + 0x4c321403, + 0x4c329413, + 0x4c331426, + 0x4c339446, 0x4c3400b9, 0x4c3480f7, - 0x4c35143f, - 0x4c35944d, - 0x4c361469, - 0x4c36948f, - 0x4c37149e, - 0x4c3794ac, - 0x4c3814c1, - 0x4c3894cd, - 0x4c3914ed, - 0x4c399517, - 0x4c3a1530, - 0x4c3a9549, + 0x4c351452, + 0x4c359460, + 0x4c36147c, + 0x4c3694a2, + 0x4c3714b1, + 0x4c3794bf, + 0x4c3814d4, + 0x4c3894e0, + 0x4c391500, + 0x4c39952a, + 0x4c3a1543, + 0x4c3a955c, 0x4c3b0635, - 0x4c3b9562, - 0x4c3c1574, - 0x4c3c9583, - 0x4c3d159c, + 0x4c3b9575, + 0x4c3c1587, + 0x4c3c9596, + 0x4c3d15af, 0x4c3d8cc6, - 0x4c3e1609, - 0x4c3e95ab, - 0x4c3f162b, - 0x4c3f9348, - 0x4c4015c1, - 0x4c4093dc, - 0x4c4115f9, - 0x4c41947c, - 0x4c4215e5, - 0x4c4293c4, - 0x503235ba, - 0x5032b5c9, - 0x503335d4, - 0x5033b5e4, - 0x503435fd, - 0x5034b617, - 0x50353625, - 0x5035b63b, - 0x5036364d, - 0x5036b663, - 0x5037367c, - 0x5037b68f, - 0x503836a7, - 0x5038b6b8, - 0x503936cd, - 0x5039b6e1, - 0x503a3701, - 0x503ab717, - 0x503b372f, - 0x503bb741, - 0x503c375d, - 0x503cb774, - 0x503d378d, - 0x503db7a3, - 0x503e37b0, - 0x503eb7c6, - 0x503f37d8, + 0x4c3e161c, + 0x4c3e95be, + 0x4c3f163e, + 0x4c3f935b, + 0x4c4015d4, + 0x4c4093ef, + 0x4c41160c, + 0x4c41948f, + 0x4c4215f8, + 0x4c4293d7, + 0x503235cd, + 0x5032b5dc, + 0x503335e7, + 0x5033b5f7, + 0x50343610, + 0x5034b62a, + 0x50353638, + 0x5035b64e, + 0x50363660, + 0x5036b676, + 0x5037368f, + 0x5037b6a2, + 
0x503836ba, + 0x5038b6cb, + 0x503936e0, + 0x5039b6f4, + 0x503a3714, + 0x503ab72a, + 0x503b3742, + 0x503bb754, + 0x503c3770, + 0x503cb787, + 0x503d37a0, + 0x503db7b6, + 0x503e37c3, + 0x503eb7d9, + 0x503f37eb, 0x503f83b3, - 0x504037eb, - 0x5040b7fb, - 0x50413815, - 0x5041b824, - 0x5042383e, - 0x5042b85b, - 0x5043386b, - 0x5043b87b, - 0x50443898, + 0x504037fe, + 0x5040b80e, + 0x50413828, + 0x5041b837, + 0x50423851, + 0x5042b86e, + 0x5043387e, + 0x5043b88e, + 0x504438ab, 0x50448469, - 0x504538ac, - 0x5045b8ca, - 0x504638dd, - 0x5046b8f3, - 0x50473905, - 0x5047b91a, - 0x50483940, - 0x5048b94e, - 0x50493961, - 0x5049b976, - 0x504a398c, - 0x504ab99c, - 0x504b39bc, - 0x504bb9cf, - 0x504c39f2, - 0x504cba20, - 0x504d3a4d, - 0x504dba6a, - 0x504e3a85, - 0x504ebaa1, - 0x504f3ab3, - 0x504fbaca, - 0x50503ad9, + 0x504538bf, + 0x5045b8dd, + 0x504638f0, + 0x5046b906, + 0x50473918, + 0x5047b92d, + 0x50483953, + 0x5048b961, + 0x50493974, + 0x5049b989, + 0x504a399f, + 0x504ab9af, + 0x504b39cf, + 0x504bb9e2, + 0x504c3a05, + 0x504cba33, + 0x504d3a60, + 0x504dba7d, + 0x504e3a98, + 0x504ebab4, + 0x504f3ac6, + 0x504fbadd, + 0x50503aec, 0x50508729, - 0x50513aec, - 0x5051b88a, - 0x50523a32, + 0x50513aff, + 0x5051b89d, + 0x50523a45, 0x58320fd1, 0x68320f93, 0x68328ceb, @@ -795,22 +796,22 @@ const uint32_t kOpenSSLReasonValues[] = { 0x783d8b97, 0x783e0aed, 0x783e8a9f, - 0x7c321261, - 0x8032148f, + 0x7c321274, + 0x803214a2, 0x80328090, - 0x803332b6, + 0x803332c9, 0x803380b9, - 0x803432c5, - 0x8034b22d, - 0x8035324b, - 0x8035b2d9, - 0x8036328d, - 0x8036b23c, - 0x8037327f, - 0x8037b21a, - 0x803832a0, - 0x8038b25c, - 0x80393271, + 0x803432d8, + 0x8034b240, + 0x8035325e, + 0x8035b2ec, + 0x803632a0, + 0x8036b24f, + 0x80373292, + 0x8037b22d, + 0x803832b3, + 0x8038b26f, + 0x80393284, }; const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]); @@ -1034,6 +1035,7 @@ const char kOpenSSLReasonStringData[] = "EMPTY_PSK\0" "EXPECTING_AN_EC_KEY_KEY\0" "EXPECTING_AN_RSA_KEY\0" + "EXPECTING_A_DH_KEY\0" "EXPECTING_A_DSA_KEY\0" "ILLEGAL_OR_UNSUPPORTED_PADDING_MODE\0" "INVALID_BUFFER_SIZE\0" diff --git a/Sources/CCryptoBoringSSL/gen/crypto/md5-586-apple.S b/Sources/CCryptoBoringSSL/gen/crypto/md5-586-apple.S new file mode 100644 index 00000000..ec0c8472 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/crypto/md5-586-apple.S @@ -0,0 +1,689 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _md5_block_asm_data_order +.private_extern _md5_block_asm_data_order +.align 4 +_md5_block_asm_data_order: +L_md5_block_asm_data_order_begin: + pushl %esi + pushl %edi + movl 12(%esp),%edi + movl 16(%esp),%esi + movl 20(%esp),%ecx + pushl %ebp + shll $6,%ecx + pushl %ebx + addl %esi,%ecx + subl $64,%ecx + movl (%edi),%eax + pushl %ecx + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx +L000start: + + # R0 section + movl %ecx,%edi + movl (%esi),%ebp + # R0 0 + xorl %edx,%edi + andl %ebx,%edi + leal 3614090360(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 4(%esi),%ebp + addl %ebx,%eax + # R0 1 + xorl %ecx,%edi + andl %eax,%edi + leal 3905402710(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 8(%esi),%ebp + addl %eax,%edx + # R0 2 + xorl %ebx,%edi + andl %edx,%edi + leal 606105819(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 12(%esi),%ebp + addl %edx,%ecx + # R0 3 + xorl %eax,%edi + andl %ecx,%edi + leal 3250441966(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 16(%esi),%ebp + addl %ecx,%ebx + # R0 4 + xorl %edx,%edi + andl %ebx,%edi + leal 4118548399(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 20(%esi),%ebp + addl %ebx,%eax + # R0 5 + xorl %ecx,%edi + andl %eax,%edi + leal 1200080426(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 24(%esi),%ebp + addl %eax,%edx + # R0 6 + xorl %ebx,%edi + andl %edx,%edi + leal 2821735955(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 28(%esi),%ebp + addl %edx,%ecx + # R0 7 + xorl %eax,%edi + andl %ecx,%edi + leal 4249261313(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 32(%esi),%ebp + addl %ecx,%ebx + # R0 8 + xorl %edx,%edi + andl %ebx,%edi + leal 1770035416(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 36(%esi),%ebp + addl %ebx,%eax + # R0 9 + xorl %ecx,%edi + andl %eax,%edi + leal 2336552879(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 40(%esi),%ebp + addl %eax,%edx + # R0 10 + xorl %ebx,%edi + andl %edx,%edi + leal 4294925233(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 44(%esi),%ebp + addl %edx,%ecx + # R0 11 + xorl %eax,%edi + andl %ecx,%edi + leal 2304563134(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 48(%esi),%ebp + addl %ecx,%ebx + # R0 12 + xorl %edx,%edi + andl %ebx,%edi + leal 1804603682(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 52(%esi),%ebp + addl %ebx,%eax + # R0 13 + xorl %ecx,%edi + andl %eax,%edi + leal 4254626195(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 56(%esi),%ebp + addl %eax,%edx + # R0 14 + xorl %ebx,%edi + andl %edx,%edi + leal 2792965006(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 60(%esi),%ebp + addl %edx,%ecx + # R0 15 + xorl %eax,%edi + andl %ecx,%edi + leal 1236535329(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 4(%esi),%ebp + addl %ecx,%ebx + + # R1 section + # R1 16 + leal 
4129170786(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 24(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 17 + leal 3225465664(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 44(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 18 + leal 643717713(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl (%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 19 + leal 3921069994(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 20(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 20 + leal 3593408605(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 40(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 21 + leal 38016083(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 60(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 22 + leal 3634488961(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 16(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 23 + leal 3889429448(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 36(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 24 + leal 568446438(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 56(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 25 + leal 3275163606(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 12(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 26 + leal 4107603335(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 32(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 27 + leal 1163531501(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 52(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 28 + leal 2850285829(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 8(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 29 + leal 4243563512(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 28(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 30 + leal 1735328473(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 48(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 31 + leal 2368359562(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 20(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + + # R2 section + # R2 32 + xorl %edx,%edi + xorl %ebx,%edi + leal 4294588738(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 32(%esi),%ebp + movl %ebx,%edi + # R2 33 + leal 2272392833(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 44(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 34 + xorl %ebx,%edi + xorl %edx,%edi + leal 1839030562(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 56(%esi),%ebp + movl %edx,%edi + # R2 35 + leal 4259657740(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 4(%esi),%ebp + addl 
%edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 36 + xorl %edx,%edi + xorl %ebx,%edi + leal 2763975236(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 16(%esi),%ebp + movl %ebx,%edi + # R2 37 + leal 1272893353(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 28(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 38 + xorl %ebx,%edi + xorl %edx,%edi + leal 4139469664(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 40(%esi),%ebp + movl %edx,%edi + # R2 39 + leal 3200236656(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 52(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 40 + xorl %edx,%edi + xorl %ebx,%edi + leal 681279174(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl (%esi),%ebp + movl %ebx,%edi + # R2 41 + leal 3936430074(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 12(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 42 + xorl %ebx,%edi + xorl %edx,%edi + leal 3572445317(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 24(%esi),%ebp + movl %edx,%edi + # R2 43 + leal 76029189(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 36(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 44 + xorl %edx,%edi + xorl %ebx,%edi + leal 3654602809(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 48(%esi),%ebp + movl %ebx,%edi + # R2 45 + leal 3873151461(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 60(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 46 + xorl %ebx,%edi + xorl %edx,%edi + leal 530742520(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 8(%esi),%ebp + movl %edx,%edi + # R2 47 + leal 3299628645(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl (%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $23,%ebx + addl %ecx,%ebx + + # R3 section + # R3 48 + xorl %edx,%edi + orl %ebx,%edi + leal 4096336452(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 28(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 49 + orl %eax,%edi + leal 1126891415(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 56(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 50 + orl %edx,%edi + leal 2878612391(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 20(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 51 + orl %ecx,%edi + leal 4237533241(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 48(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + # R3 52 + orl %ebx,%edi + leal 1700485571(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 12(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 53 + orl %eax,%edi + leal 2399980690(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 40(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 54 + orl %edx,%edi + leal 4293915773(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 4(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 55 + orl %ecx,%edi + leal 2240044497(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 32(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl 
%ecx,%ebx + # R3 56 + orl %ebx,%edi + leal 1873313359(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 60(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 57 + orl %eax,%edi + leal 4264355552(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 24(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 58 + orl %edx,%edi + leal 2734768916(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 52(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 59 + orl %ecx,%edi + leal 1309151649(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 16(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + # R3 60 + orl %ebx,%edi + leal 4149444226(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 44(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 61 + orl %eax,%edi + leal 3174756917(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 8(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 62 + orl %edx,%edi + leal 718787259(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 36(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 63 + orl %ecx,%edi + leal 3951481745(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 24(%esp),%ebp + addl %edi,%ebx + addl $64,%esi + roll $21,%ebx + movl (%ebp),%edi + addl %ecx,%ebx + addl %edi,%eax + movl 4(%ebp),%edi + addl %edi,%ebx + movl 8(%ebp),%edi + addl %edi,%ecx + movl 12(%ebp),%edi + addl %edi,%edx + movl %eax,(%ebp) + movl %ebx,4(%ebp) + movl (%esp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + cmpl %esi,%edi + jae L000start + popl %eax + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/crypto/md5-586-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/crypto/md5-586-linux.S index 6de63f48..ed1ac320 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/md5-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -686,7 +685,6 @@ md5_block_asm_data_order: ret .size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-apple.S index 44871ebd..e5e052dd 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -690,7 +689,6 @@ L$epilogue: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-linux.S index f5556018..ffb7fb13 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -695,7 +694,6 @@ _CET_ENDBR .cfi_endproc .size md5_block_asm_data_order,.-md5_block_asm_data_order #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/hash.txt b/Sources/CCryptoBoringSSL/hash.txt index 74b3edbd..64113321 100644 --- a/Sources/CCryptoBoringSSL/hash.txt +++ b/Sources/CCryptoBoringSSL/hash.txt @@ -1 +1 @@ -This directory is derived from BoringSSL cloned from https://boringssl.googlesource.com/boringssl at revision dbad745811195c00b729efd0ee0a09b7d9fce1d2 +This directory is derived from BoringSSL cloned from https://boringssl.googlesource.com/boringssl at revision 6a2ccdcc2ed1d37a43a2183658d2ae61fd5ce208 diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL.h index ac689525..60637c3c 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL.h @@ -14,6 +14,7 @@ #ifndef C_CRYPTO_BORINGSSL_H #define C_CRYPTO_BORINGSSL_H +#include "CCryptoBoringSSL_aead.h" #include "CCryptoBoringSSL_aes.h" #include "CCryptoBoringSSL_arm_arch.h" #include "CCryptoBoringSSL_asn1_mac.h" @@ -22,6 +23,7 @@ #include "CCryptoBoringSSL_bio.h" #include "CCryptoBoringSSL_blake2.h" #include "CCryptoBoringSSL_blowfish.h" +#include "CCryptoBoringSSL_bn.h" #include "CCryptoBoringSSL_boringssl_prefix_symbols.h" #include "CCryptoBoringSSL_boringssl_prefix_symbols_asm.h" #include "CCryptoBoringSSL_cast.h" diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_asn1.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_asn1.h index dbf5b5b8..09ae0156 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_asn1.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_asn1.h @@ -468,7 +468,8 @@ DECLARE_ASN1_ITEM(ASN1_FBOOLEAN) // An asn1_string_st (aka |ASN1_STRING|) represents a value of a string-like // ASN.1 type. It contains a |type| field, and a byte string |data| field with a -// type-specific representation. +// type-specific representation. This type-specific representation does not +// always correspond to the DER encoding of the type. // // If |type| is one of |V_ASN1_OCTET_STRING|, |V_ASN1_UTF8STRING|, // |V_ASN1_NUMERICSTRING|, |V_ASN1_PRINTABLESTRING|, |V_ASN1_T61STRING|, @@ -568,6 +569,10 @@ OPENSSL_EXPORT int ASN1_STRING_type(const ASN1_STRING *str); // ASN1_STRING_get0_data returns a pointer to |str|'s contents. Callers should // use |ASN1_STRING_length| to determine the length of the string. The string // may have embedded NUL bytes and may not be NUL-terminated. +// +// The contents of an |ASN1_STRING| encode the value in some type-specific +// representation that does not always correspond to the DER encoding of the +// type. See the documentation for |ASN1_STRING| for details. OPENSSL_EXPORT const unsigned char *ASN1_STRING_get0_data( const ASN1_STRING *str); @@ -575,10 +580,18 @@ OPENSSL_EXPORT const unsigned char *ASN1_STRING_get0_data( // should use |ASN1_STRING_length| to determine the length of the string. The // string may have embedded NUL bytes and may not be NUL-terminated. // +// The contents of an |ASN1_STRING| encode the value in some type-specific +// representation that does not always correspond to the DER encoding of the +// type. See the documentation for |ASN1_STRING| for details. +// // Prefer |ASN1_STRING_get0_data|. 
OPENSSL_EXPORT unsigned char *ASN1_STRING_data(ASN1_STRING *str); // ASN1_STRING_length returns the length of |str|, in bytes. +// +// The contents of an |ASN1_STRING| encode the value in some type-specific +// representation that does not always correspond to the DER encoding of the +// type. See the documentation for |ASN1_STRING| for details. OPENSSL_EXPORT int ASN1_STRING_length(const ASN1_STRING *str); // ASN1_STRING_cmp compares |a| and |b|'s type and contents. It returns an diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_base.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_base.h index bf897fd5..1a02695d 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_base.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_base.h @@ -189,6 +189,13 @@ extern "C" { #define OPENSSL_PRINTF_FORMAT_FUNC(string_index, first_to_check) #endif +// OPENSSL_CLANG_PRAGMA emits a pragma on clang and nothing on other compilers. +#if defined(__clang__) +#define OPENSSL_CLANG_PRAGMA(arg) _Pragma(arg) +#else +#define OPENSSL_CLANG_PRAGMA(arg) +#endif + // OPENSSL_MSVC_PRAGMA emits a pragma on MSVC and nothing on other compilers. #if defined(_MSC_VER) #define OPENSSL_MSVC_PRAGMA(arg) __pragma(arg) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bcm_public.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bcm_public.h new file mode 100644 index 00000000..89c45765 --- /dev/null +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bcm_public.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2024, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_BCM_PUBLIC_H_ +#define OPENSSL_HEADER_BCM_PUBLIC_H_ + +#include "CCryptoBoringSSL_base.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +// Public types referenced by BoringCrypto +// +// This header contains public types referenced by BCM. Such types are difficult +// to hide from the libcrypto interface, so we treat them as part of BCM. + +// BCM_SHA_CBLOCK is the block size of SHA-1. +#define BCM_SHA_CBLOCK 64 + +// SHA_CTX +struct sha_state_st { +#if defined(__cplusplus) || defined(OPENSSL_WINDOWS) + uint32_t h[5]; +#else + // wpa_supplicant accesses |h0|..|h4| so we must support those names for + // compatibility with it until it can be updated. Anonymous unions are only + // standard in C11, so disable this workaround in C++. 
+ union { + uint32_t h[5]; + struct { + uint32_t h0; + uint32_t h1; + uint32_t h2; + uint32_t h3; + uint32_t h4; + }; + }; +#endif + uint32_t Nl, Nh; + uint8_t data[BCM_SHA_CBLOCK]; + unsigned num; +}; + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_BCM_PUBLIC_H_ diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bio.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bio.h index 839bf205..0e6bab06 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bio.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bio.h @@ -714,33 +714,35 @@ OPENSSL_EXPORT void BIO_meth_free(BIO_METHOD *method); // and returns one. The function should return one on success and zero on // error. OPENSSL_EXPORT int BIO_meth_set_create(BIO_METHOD *method, - int (*create)(BIO *)); + int (*create_func)(BIO *)); // BIO_meth_set_destroy sets a function to release data associated with a |BIO| // and returns one. The function's return value is ignored. OPENSSL_EXPORT int BIO_meth_set_destroy(BIO_METHOD *method, - int (*destroy)(BIO *)); + int (*destroy_func)(BIO *)); // BIO_meth_set_write sets the implementation of |BIO_write| for |method| and // returns one. |BIO_METHOD|s which implement |BIO_write| should also implement // |BIO_CTRL_FLUSH|. (See |BIO_meth_set_ctrl|.) OPENSSL_EXPORT int BIO_meth_set_write(BIO_METHOD *method, - int (*write)(BIO *, const char *, int)); + int (*write_func)(BIO *, const char *, + int)); // BIO_meth_set_read sets the implementation of |BIO_read| for |method| and // returns one. OPENSSL_EXPORT int BIO_meth_set_read(BIO_METHOD *method, - int (*read)(BIO *, char *, int)); + int (*read_func)(BIO *, char *, int)); // BIO_meth_set_gets sets the implementation of |BIO_gets| for |method| and // returns one. OPENSSL_EXPORT int BIO_meth_set_gets(BIO_METHOD *method, - int (*gets)(BIO *, char *, int)); + int (*gets_func)(BIO *, char *, int)); // BIO_meth_set_ctrl sets the implementation of |BIO_ctrl| for |method| and // returns one. OPENSSL_EXPORT int BIO_meth_set_ctrl(BIO_METHOD *method, - long (*ctrl)(BIO *, int, long, void *)); + long (*ctrl_func)(BIO *, int, long, + void *)); // BIO_set_data sets custom data on |bio|. It may be retried with // |BIO_get_data|. diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bn.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bn.h index a93aaa5e..013bc10e 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bn.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bn.h @@ -388,9 +388,9 @@ OPENSSL_EXPORT void BN_CTX_end(BN_CTX *ctx); // or |b|. It returns one on success and zero on allocation failure. OPENSSL_EXPORT int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); -// BN_uadd sets |r| = |a| + |b|, where |a| and |b| are non-negative and |r| may -// be the same pointer as either |a| or |b|. It returns one on success and zero -// on allocation failure. +// BN_uadd sets |r| = |a| + |b|, considering only the absolute values of |a| and +// |b|. |r| may be the same pointer as either |a| or |b|. It returns one on +// success and zero on allocation failure. OPENSSL_EXPORT int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); // BN_add_word adds |w| to |a|. It returns one on success and zero otherwise. @@ -400,9 +400,9 @@ OPENSSL_EXPORT int BN_add_word(BIGNUM *a, BN_ULONG w); // or |b|. It returns one on success and zero on allocation failure. 
OPENSSL_EXPORT int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); -// BN_usub sets |r| = |a| - |b|, where |a| and |b| are non-negative integers, -// |b| < |a| and |r| may be the same pointer as either |a| or |b|. It returns -// one on success and zero on allocation failure. +// BN_usub sets |r| = |a| - |b|, considering only the absolute values of |a| and +// |b|. The result must be non-negative, i.e. |b| <= |a|. |r| may be the same +// pointer as either |a| or |b|. It returns one on success and zero on error. OPENSSL_EXPORT int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); // BN_sub_word subtracts |w| from |a|. It returns one on success and zero on @@ -425,9 +425,14 @@ OPENSSL_EXPORT int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx); // BN_div divides |numerator| by |divisor| and places the result in |quotient| // and the remainder in |rem|. Either of |quotient| or |rem| may be NULL, in -// which case the respective value is not returned. The result is rounded -// towards zero; thus if |numerator| is negative, the remainder will be zero or -// negative. It returns one on success or zero on error. +// which case the respective value is not returned. It returns one on success or +// zero on error. It is an error condition if |divisor| is zero. +// +// The outputs will be such that |quotient| * |divisor| + |rem| = |numerator|, +// with the quotient rounded towards zero. Thus, if |numerator| is negative, +// |rem| will be zero or negative. If |divisor| is negative, the sign of +// |quotient| will be flipped to compensate but otherwise rounding will be as if +// |divisor| were its absolute value. OPENSSL_EXPORT int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator, const BIGNUM *divisor, BN_CTX *ctx); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols.h index 5cb6e422..7ec8370c 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols.h @@ -210,6 +210,14 @@ #define BASIC_CONSTRAINTS_free BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_free) #define BASIC_CONSTRAINTS_it BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_it) #define BASIC_CONSTRAINTS_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_new) +#define BCM_fips_186_2_prf BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_fips_186_2_prf) +#define BCM_rand_bytes BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_rand_bytes) +#define BCM_rand_bytes_hwrng BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_rand_bytes_hwrng) +#define BCM_rand_bytes_with_additional_data BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_rand_bytes_with_additional_data) +#define BCM_sha1_final BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_sha1_final) +#define BCM_sha1_init BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_sha1_init) +#define BCM_sha1_transform BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_sha1_transform) +#define BCM_sha1_update BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_sha1_update) #define BIO_append_filename BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BIO_append_filename) #define BIO_callback_ctrl BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BIO_callback_ctrl) #define BIO_clear_flags BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BIO_clear_flags) @@ -716,6 +724,16 @@ #define DH_size BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DH_size) #define DH_up_ref BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DH_up_ref) #define DHparams_dup 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DHparams_dup) +#define DILITHIUM_generate_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_generate_key) +#define DILITHIUM_generate_key_external_entropy BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_generate_key_external_entropy) +#define DILITHIUM_marshal_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_marshal_private_key) +#define DILITHIUM_marshal_public_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_marshal_public_key) +#define DILITHIUM_parse_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_parse_private_key) +#define DILITHIUM_parse_public_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_parse_public_key) +#define DILITHIUM_public_from_private BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_public_from_private) +#define DILITHIUM_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_sign) +#define DILITHIUM_sign_deterministic BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_sign_deterministic) +#define DILITHIUM_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_verify) #define DIRECTORYSTRING_free BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DIRECTORYSTRING_free) #define DIRECTORYSTRING_it BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DIRECTORYSTRING_it) #define DIRECTORYSTRING_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DIRECTORYSTRING_new) @@ -1097,6 +1115,7 @@ #define EVP_PKEY_CTX_set0_rsa_oaep_label BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set0_rsa_oaep_label) #define EVP_PKEY_CTX_set1_hkdf_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set1_hkdf_key) #define EVP_PKEY_CTX_set1_hkdf_salt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set1_hkdf_salt) +#define EVP_PKEY_CTX_set_dh_pad BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dh_pad) #define EVP_PKEY_CTX_set_dsa_paramgen_bits BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dsa_paramgen_bits) #define EVP_PKEY_CTX_set_dsa_paramgen_q_bits BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dsa_paramgen_q_bits) #define EVP_PKEY_CTX_set_ec_param_enc BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_ec_param_enc) @@ -1113,6 +1132,7 @@ #define EVP_PKEY_CTX_set_rsa_pss_saltlen BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_rsa_pss_saltlen) #define EVP_PKEY_CTX_set_signature_md BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_signature_md) #define EVP_PKEY_assign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign) +#define EVP_PKEY_assign_DH BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign_DH) #define EVP_PKEY_assign_DSA BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign_DSA) #define EVP_PKEY_assign_EC_KEY BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign_EC_KEY) #define EVP_PKEY_assign_RSA BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign_RSA) @@ -1154,6 +1174,7 @@ #define EVP_PKEY_print_params BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_print_params) #define EVP_PKEY_print_private BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_print_private) #define EVP_PKEY_print_public BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_print_public) +#define EVP_PKEY_set1_DH BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_set1_DH) #define EVP_PKEY_set1_DSA BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_set1_DSA) #define EVP_PKEY_set1_EC_KEY BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_set1_EC_KEY) #define EVP_PKEY_set1_RSA BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_set1_RSA) @@ -1238,6 +1259,7 @@ #define EVP_hpke_aes_256_gcm BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, 
EVP_hpke_aes_256_gcm) #define EVP_hpke_chacha20_poly1305 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_hpke_chacha20_poly1305) #define EVP_hpke_hkdf_sha256 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_hpke_hkdf_sha256) +#define EVP_hpke_p256_hkdf_sha256 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_hpke_p256_hkdf_sha256) #define EVP_hpke_x25519_hkdf_sha256 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_hpke_x25519_hkdf_sha256) #define EVP_marshal_digest_algorithm BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_marshal_digest_algorithm) #define EVP_marshal_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_marshal_private_key) @@ -1340,6 +1362,18 @@ #define MD5_Update BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MD5_Update) #define METHOD_ref BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, METHOD_ref) #define METHOD_unref BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, METHOD_unref) +#define MLDSA65_generate_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_generate_key) +#define MLDSA65_generate_key_external_entropy BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_generate_key_external_entropy) +#define MLDSA65_marshal_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_marshal_private_key) +#define MLDSA65_marshal_public_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_marshal_public_key) +#define MLDSA65_parse_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_parse_private_key) +#define MLDSA65_parse_public_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_parse_public_key) +#define MLDSA65_private_key_from_seed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_private_key_from_seed) +#define MLDSA65_public_from_private BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_public_from_private) +#define MLDSA65_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_sign) +#define MLDSA65_sign_internal BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_sign_internal) +#define MLDSA65_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_verify) +#define MLDSA65_verify_internal BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_verify_internal) #define NAME_CONSTRAINTS_check BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NAME_CONSTRAINTS_check) #define NAME_CONSTRAINTS_free BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NAME_CONSTRAINTS_free) #define NAME_CONSTRAINTS_it BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NAME_CONSTRAINTS_it) @@ -1365,6 +1399,8 @@ #define NOTICEREF_free BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NOTICEREF_free) #define NOTICEREF_it BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NOTICEREF_it) #define NOTICEREF_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NOTICEREF_new) +#define OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER) +#define OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER) #define OBJ_cbs2nid BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJ_cbs2nid) #define OBJ_cleanup BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJ_cleanup) #define OBJ_cmp BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJ_cmp) @@ -1404,6 +1440,7 @@ #define OPENSSL_gmtime_diff BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_gmtime_diff) #define OPENSSL_hash32 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_hash32) #define OPENSSL_ia32cap_P BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_ia32cap_P) +#define OPENSSL_init_cpuid BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_init_cpuid) #define 
OPENSSL_init_crypto BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_init_crypto) #define OPENSSL_isalnum BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_isalnum) #define OPENSSL_isalpha BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_isalpha) @@ -1613,7 +1650,6 @@ #define RAND_SSLeay BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_SSLeay) #define RAND_add BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_add) #define RAND_bytes BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_bytes) -#define RAND_bytes_with_additional_data BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_bytes_with_additional_data) #define RAND_cleanup BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_cleanup) #define RAND_disable_fork_unsafe_buffering BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_disable_fork_unsafe_buffering) #define RAND_egd BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_egd) @@ -1737,6 +1773,10 @@ #define SPAKE2_CTX_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPAKE2_CTX_new) #define SPAKE2_generate_msg BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPAKE2_generate_msg) #define SPAKE2_process_msg BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPAKE2_process_msg) +#define SPX_generate_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPX_generate_key) +#define SPX_generate_key_from_seed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPX_generate_key_from_seed) +#define SPX_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPX_sign) +#define SPX_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPX_verify) #define SSLeay BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SSLeay) #define SSLeay_version BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SSLeay_version) #define TRUST_TOKEN_CLIENT_add_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, TRUST_TOKEN_CLIENT_add_key) @@ -2065,11 +2105,9 @@ #define X509_STORE_load_locations BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_load_locations) #define X509_STORE_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_new) #define X509_STORE_set1_param BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set1_param) -#define X509_STORE_set_check_crl BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_check_crl) #define X509_STORE_set_default_paths BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_default_paths) #define X509_STORE_set_depth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_depth) #define X509_STORE_set_flags BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_flags) -#define X509_STORE_set_get_crl BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_get_crl) #define X509_STORE_set_purpose BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_purpose) #define X509_STORE_set_trust BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_trust) #define X509_STORE_set_verify_cb BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_verify_cb) @@ -2249,8 +2287,11 @@ #define aes_hw_decrypt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_decrypt) #define aes_hw_ecb_encrypt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_ecb_encrypt) #define aes_hw_encrypt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_encrypt) +#define aes_hw_encrypt_key_to_decrypt_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_encrypt_key_to_decrypt_key) #define aes_hw_set_decrypt_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_set_decrypt_key) #define aes_hw_set_encrypt_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_set_encrypt_key) +#define aes_hw_set_encrypt_key_alt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_set_encrypt_key_alt) +#define aes_hw_set_encrypt_key_base BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_set_encrypt_key_base) #define aes_nohw_cbc_encrypt 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_nohw_cbc_encrypt) #define aes_nohw_ctr32_encrypt_blocks BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_nohw_ctr32_encrypt_blocks) #define aes_nohw_decrypt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_nohw_decrypt) @@ -2322,19 +2363,22 @@ #define bn_mont_ctx_set_RR_consttime BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mont_ctx_set_RR_consttime) #define bn_mont_n0 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mont_n0) #define bn_mul4x_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul4x_mont) +#define bn_mul4x_mont_gather5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul4x_mont_gather5) #define bn_mul_add_words BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_add_words) #define bn_mul_comba4 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_comba4) #define bn_mul_comba8 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_comba8) #define bn_mul_consttime BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_consttime) #define bn_mul_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_mont) -#define bn_mul_mont_gather5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_mont_gather5) +#define bn_mul_mont_gather5_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_mont_gather5_nohw) #define bn_mul_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_mont_nohw) #define bn_mul_small BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_small) #define bn_mul_words BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_words) #define bn_mulx4x_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mulx4x_mont) +#define bn_mulx4x_mont_gather5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mulx4x_mont_gather5) #define bn_odd_number_is_obviously_composite BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_odd_number_is_obviously_composite) #define bn_one_to_montgomery BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_one_to_montgomery) -#define bn_power5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_power5) +#define bn_power5_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_power5_nohw) +#define bn_powerx5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_powerx5) #define bn_rand_range_words BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_rand_range_words) #define bn_rand_secret_range BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_rand_secret_range) #define bn_reduce_once BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_reduce_once) @@ -2369,7 +2413,11 @@ #define c2i_ASN1_INTEGER BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, c2i_ASN1_INTEGER) #define c2i_ASN1_OBJECT BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, c2i_ASN1_OBJECT) #define chacha20_poly1305_open BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_open) +#define chacha20_poly1305_open_avx2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_open_avx2) +#define chacha20_poly1305_open_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_open_nohw) #define chacha20_poly1305_seal BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_seal) +#define chacha20_poly1305_seal_avx2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_seal_avx2) +#define chacha20_poly1305_seal_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_seal_nohw) #define crypto_gcm_clmul_enabled BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, crypto_gcm_clmul_enabled) #define d2i_ASN1_BIT_STRING BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_ASN1_BIT_STRING) #define d2i_ASN1_BMPSTRING BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_ASN1_BMPSTRING) @@ -2478,8 +2526,10 @@ #define d2i_X509_VAL BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_X509_VAL) #define d2i_X509_bio BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_X509_bio) #define d2i_X509_fp 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_X509_fp) +#define dh_asn1_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dh_asn1_meth) #define dh_check_params_fast BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dh_check_params_fast) #define dh_compute_key_padded_no_self_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dh_compute_key_padded_no_self_test) +#define dh_pkey_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dh_pkey_meth) #define dsa_asn1_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dsa_asn1_meth) #define dsa_check_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dsa_check_key) #define ec_GFp_mont_add BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ec_GFp_mont_add) @@ -2569,25 +2619,46 @@ #define ec_set_to_safe_point BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ec_set_to_safe_point) #define ec_simple_scalar_inv0_montgomery BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ec_simple_scalar_inv0_montgomery) #define ec_simple_scalar_to_montgomery_inv_vartime BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ec_simple_scalar_to_montgomery_inv_vartime) -#define ecdsa_do_verify_no_self_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_do_verify_no_self_test) -#define ecdsa_sign_with_nonce_for_known_answer_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_sign_with_nonce_for_known_answer_test) -#define ecp_nistz256_avx2_select_w7 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_avx2_select_w7) +#define ecdsa_sign_fixed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_sign_fixed) +#define ecdsa_sign_fixed_with_nonce_for_known_answer_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_sign_fixed_with_nonce_for_known_answer_test) +#define ecdsa_verify_fixed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_verify_fixed) +#define ecdsa_verify_fixed_no_self_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_verify_fixed_no_self_test) #define ecp_nistz256_div_by_2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_div_by_2) #define ecp_nistz256_mul_by_2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_by_2) #define ecp_nistz256_mul_by_3 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_by_3) #define ecp_nistz256_mul_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_mont) +#define ecp_nistz256_mul_mont_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_mont_adx) +#define ecp_nistz256_mul_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_mont_nohw) #define ecp_nistz256_neg BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_neg) #define ecp_nistz256_ord_mul_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont) +#define ecp_nistz256_ord_mul_mont_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont_adx) +#define ecp_nistz256_ord_mul_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont_nohw) #define ecp_nistz256_ord_sqr_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont) +#define ecp_nistz256_ord_sqr_mont_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont_adx) +#define ecp_nistz256_ord_sqr_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont_nohw) #define ecp_nistz256_point_add BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add) +#define ecp_nistz256_point_add_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_adx) #define ecp_nistz256_point_add_affine BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine) +#define ecp_nistz256_point_add_affine_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine_adx) +#define ecp_nistz256_point_add_affine_nohw 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine_nohw) +#define ecp_nistz256_point_add_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_nohw) #define ecp_nistz256_point_double BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_double) +#define ecp_nistz256_point_double_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_double_adx) +#define ecp_nistz256_point_double_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_double_nohw) #define ecp_nistz256_select_w5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w5) +#define ecp_nistz256_select_w5_avx2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w5_avx2) +#define ecp_nistz256_select_w5_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w5_nohw) #define ecp_nistz256_select_w7 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w7) +#define ecp_nistz256_select_w7_avx2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w7_avx2) +#define ecp_nistz256_select_w7_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w7_nohw) #define ecp_nistz256_sqr_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont) +#define ecp_nistz256_sqr_mont_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont_adx) +#define ecp_nistz256_sqr_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont_nohw) #define ecp_nistz256_sub BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_sub) #define ed25519_asn1_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ed25519_asn1_meth) #define ed25519_pkey_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ed25519_pkey_meth) +#define evp_md_md5_sha1 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, evp_md_md5_sha1) +#define evp_pkey_set_method BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, evp_pkey_set_method) #define fiat_curve25519_adx_mul BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, fiat_curve25519_adx_mul) #define fiat_curve25519_adx_square BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, fiat_curve25519_adx_square) #define fiat_p256_adx_mul BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, fiat_p256_adx_mul) @@ -2829,8 +2900,6 @@ #define spx_fors_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_fors_sign) #define spx_fors_sk_gen BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_fors_sk_gen) #define spx_fors_treehash BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_fors_treehash) -#define spx_generate_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_generate_key) -#define spx_generate_key_from_seed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_generate_key_from_seed) #define spx_get_tree_index BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_get_tree_index) #define spx_ht_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_ht_sign) #define spx_ht_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_ht_verify) @@ -2842,7 +2911,6 @@ #define spx_set_tree_height BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_set_tree_height) #define spx_set_tree_index BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_set_tree_index) #define spx_set_type BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_set_type) -#define spx_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_sign) #define spx_thash_f BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_thash_f) #define spx_thash_h BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_thash_h) #define spx_thash_hmsg BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_thash_hmsg) @@ -2853,12 +2921,12 @@ #define spx_to_uint64 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_to_uint64) #define spx_treehash BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_treehash) #define spx_uint64_to_len_bytes 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_uint64_to_len_bytes) -#define spx_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_verify) #define spx_wots_pk_from_sig BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_wots_pk_from_sig) #define spx_wots_pk_gen BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_wots_pk_gen) #define spx_wots_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_wots_sign) #define spx_xmss_pk_from_sig BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_xmss_pk_from_sig) #define spx_xmss_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_xmss_sign) +#define swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE) #define v2i_GENERAL_NAME BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, v2i_GENERAL_NAME) #define v2i_GENERAL_NAMES BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, v2i_GENERAL_NAMES) #define v2i_GENERAL_NAME_ex BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, v2i_GENERAL_NAME_ex) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols_asm.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols_asm.h index 3f18f9d2..2d0fad13 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols_asm.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols_asm.h @@ -215,6 +215,14 @@ #define _BASIC_CONSTRAINTS_free BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_free) #define _BASIC_CONSTRAINTS_it BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_it) #define _BASIC_CONSTRAINTS_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_new) +#define _BCM_fips_186_2_prf BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_fips_186_2_prf) +#define _BCM_rand_bytes BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_rand_bytes) +#define _BCM_rand_bytes_hwrng BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_rand_bytes_hwrng) +#define _BCM_rand_bytes_with_additional_data BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_rand_bytes_with_additional_data) +#define _BCM_sha1_final BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_sha1_final) +#define _BCM_sha1_init BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_sha1_init) +#define _BCM_sha1_transform BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_sha1_transform) +#define _BCM_sha1_update BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_sha1_update) #define _BIO_append_filename BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BIO_append_filename) #define _BIO_callback_ctrl BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BIO_callback_ctrl) #define _BIO_clear_flags BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BIO_clear_flags) @@ -721,6 +729,16 @@ #define _DH_size BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DH_size) #define _DH_up_ref BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DH_up_ref) #define _DHparams_dup BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DHparams_dup) +#define _DILITHIUM_generate_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_generate_key) +#define _DILITHIUM_generate_key_external_entropy BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_generate_key_external_entropy) +#define _DILITHIUM_marshal_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_marshal_private_key) +#define _DILITHIUM_marshal_public_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_marshal_public_key) +#define _DILITHIUM_parse_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, 
DILITHIUM_parse_private_key) +#define _DILITHIUM_parse_public_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_parse_public_key) +#define _DILITHIUM_public_from_private BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_public_from_private) +#define _DILITHIUM_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_sign) +#define _DILITHIUM_sign_deterministic BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_sign_deterministic) +#define _DILITHIUM_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_verify) #define _DIRECTORYSTRING_free BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DIRECTORYSTRING_free) #define _DIRECTORYSTRING_it BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DIRECTORYSTRING_it) #define _DIRECTORYSTRING_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DIRECTORYSTRING_new) @@ -1102,6 +1120,7 @@ #define _EVP_PKEY_CTX_set0_rsa_oaep_label BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set0_rsa_oaep_label) #define _EVP_PKEY_CTX_set1_hkdf_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set1_hkdf_key) #define _EVP_PKEY_CTX_set1_hkdf_salt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set1_hkdf_salt) +#define _EVP_PKEY_CTX_set_dh_pad BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dh_pad) #define _EVP_PKEY_CTX_set_dsa_paramgen_bits BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dsa_paramgen_bits) #define _EVP_PKEY_CTX_set_dsa_paramgen_q_bits BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dsa_paramgen_q_bits) #define _EVP_PKEY_CTX_set_ec_param_enc BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_ec_param_enc) @@ -1118,6 +1137,7 @@ #define _EVP_PKEY_CTX_set_rsa_pss_saltlen BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_rsa_pss_saltlen) #define _EVP_PKEY_CTX_set_signature_md BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_signature_md) #define _EVP_PKEY_assign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign) +#define _EVP_PKEY_assign_DH BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign_DH) #define _EVP_PKEY_assign_DSA BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign_DSA) #define _EVP_PKEY_assign_EC_KEY BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign_EC_KEY) #define _EVP_PKEY_assign_RSA BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign_RSA) @@ -1159,6 +1179,7 @@ #define _EVP_PKEY_print_params BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_print_params) #define _EVP_PKEY_print_private BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_print_private) #define _EVP_PKEY_print_public BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_print_public) +#define _EVP_PKEY_set1_DH BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_set1_DH) #define _EVP_PKEY_set1_DSA BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_set1_DSA) #define _EVP_PKEY_set1_EC_KEY BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_set1_EC_KEY) #define _EVP_PKEY_set1_RSA BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_set1_RSA) @@ -1243,6 +1264,7 @@ #define _EVP_hpke_aes_256_gcm BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_aes_256_gcm) #define _EVP_hpke_chacha20_poly1305 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_chacha20_poly1305) #define _EVP_hpke_hkdf_sha256 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_hkdf_sha256) +#define _EVP_hpke_p256_hkdf_sha256 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_p256_hkdf_sha256) #define _EVP_hpke_x25519_hkdf_sha256 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_x25519_hkdf_sha256) #define _EVP_marshal_digest_algorithm BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_marshal_digest_algorithm) #define _EVP_marshal_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_marshal_private_key) @@ -1345,6 +1367,18 @@ #define _MD5_Update BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MD5_Update) #define _METHOD_ref BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, METHOD_ref) #define _METHOD_unref BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, METHOD_unref) +#define _MLDSA65_generate_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_generate_key) +#define _MLDSA65_generate_key_external_entropy BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_generate_key_external_entropy) +#define _MLDSA65_marshal_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_marshal_private_key) +#define _MLDSA65_marshal_public_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_marshal_public_key) +#define _MLDSA65_parse_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_parse_private_key) +#define _MLDSA65_parse_public_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_parse_public_key) +#define _MLDSA65_private_key_from_seed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_private_key_from_seed) +#define _MLDSA65_public_from_private BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_public_from_private) +#define _MLDSA65_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_sign) +#define _MLDSA65_sign_internal BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_sign_internal) +#define _MLDSA65_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_verify) +#define _MLDSA65_verify_internal BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_verify_internal) #define _NAME_CONSTRAINTS_check BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NAME_CONSTRAINTS_check) #define _NAME_CONSTRAINTS_free BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NAME_CONSTRAINTS_free) #define _NAME_CONSTRAINTS_it BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NAME_CONSTRAINTS_it) @@ -1370,6 +1404,8 @@ #define _NOTICEREF_free BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NOTICEREF_free) #define _NOTICEREF_it BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NOTICEREF_it) #define _NOTICEREF_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NOTICEREF_new) +#define _OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER) +#define _OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER) #define _OBJ_cbs2nid BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJ_cbs2nid) #define _OBJ_cleanup BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJ_cleanup) #define _OBJ_cmp BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJ_cmp) @@ -1409,6 +1445,7 @@ #define _OPENSSL_gmtime_diff BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_gmtime_diff) #define _OPENSSL_hash32 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_hash32) #define _OPENSSL_ia32cap_P BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_ia32cap_P) +#define _OPENSSL_init_cpuid 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_init_cpuid) #define _OPENSSL_init_crypto BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_init_crypto) #define _OPENSSL_isalnum BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_isalnum) #define _OPENSSL_isalpha BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_isalpha) @@ -1618,7 +1655,6 @@ #define _RAND_SSLeay BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_SSLeay) #define _RAND_add BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_add) #define _RAND_bytes BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_bytes) -#define _RAND_bytes_with_additional_data BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_bytes_with_additional_data) #define _RAND_cleanup BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_cleanup) #define _RAND_disable_fork_unsafe_buffering BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_disable_fork_unsafe_buffering) #define _RAND_egd BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_egd) @@ -1742,6 +1778,10 @@ #define _SPAKE2_CTX_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPAKE2_CTX_new) #define _SPAKE2_generate_msg BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPAKE2_generate_msg) #define _SPAKE2_process_msg BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPAKE2_process_msg) +#define _SPX_generate_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPX_generate_key) +#define _SPX_generate_key_from_seed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPX_generate_key_from_seed) +#define _SPX_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPX_sign) +#define _SPX_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPX_verify) #define _SSLeay BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SSLeay) #define _SSLeay_version BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SSLeay_version) #define _TRUST_TOKEN_CLIENT_add_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, TRUST_TOKEN_CLIENT_add_key) @@ -2070,11 +2110,9 @@ #define _X509_STORE_load_locations BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_load_locations) #define _X509_STORE_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_new) #define _X509_STORE_set1_param BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set1_param) -#define _X509_STORE_set_check_crl BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_check_crl) #define _X509_STORE_set_default_paths BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_default_paths) #define _X509_STORE_set_depth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_depth) #define _X509_STORE_set_flags BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_flags) -#define _X509_STORE_set_get_crl BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_get_crl) #define _X509_STORE_set_purpose BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_purpose) #define _X509_STORE_set_trust BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_trust) #define _X509_STORE_set_verify_cb BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_verify_cb) @@ -2254,8 +2292,11 @@ #define _aes_hw_decrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_decrypt) #define _aes_hw_ecb_encrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_ecb_encrypt) #define _aes_hw_encrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_encrypt) +#define _aes_hw_encrypt_key_to_decrypt_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_encrypt_key_to_decrypt_key) #define _aes_hw_set_decrypt_key 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_set_decrypt_key) #define _aes_hw_set_encrypt_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_set_encrypt_key) +#define _aes_hw_set_encrypt_key_alt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_set_encrypt_key_alt) +#define _aes_hw_set_encrypt_key_base BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_set_encrypt_key_base) #define _aes_nohw_cbc_encrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_nohw_cbc_encrypt) #define _aes_nohw_ctr32_encrypt_blocks BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_nohw_ctr32_encrypt_blocks) #define _aes_nohw_decrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_nohw_decrypt) @@ -2327,19 +2368,22 @@ #define _bn_mont_ctx_set_RR_consttime BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mont_ctx_set_RR_consttime) #define _bn_mont_n0 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mont_n0) #define _bn_mul4x_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul4x_mont) +#define _bn_mul4x_mont_gather5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul4x_mont_gather5) #define _bn_mul_add_words BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_add_words) #define _bn_mul_comba4 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_comba4) #define _bn_mul_comba8 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_comba8) #define _bn_mul_consttime BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_consttime) #define _bn_mul_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_mont) -#define _bn_mul_mont_gather5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_mont_gather5) +#define _bn_mul_mont_gather5_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_mont_gather5_nohw) #define _bn_mul_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_mont_nohw) #define _bn_mul_small BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_small) #define _bn_mul_words BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_words) #define _bn_mulx4x_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mulx4x_mont) +#define _bn_mulx4x_mont_gather5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mulx4x_mont_gather5) #define _bn_odd_number_is_obviously_composite BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_odd_number_is_obviously_composite) #define _bn_one_to_montgomery BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_one_to_montgomery) -#define _bn_power5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_power5) +#define _bn_power5_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_power5_nohw) +#define _bn_powerx5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_powerx5) #define _bn_rand_range_words BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_rand_range_words) #define _bn_rand_secret_range BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_rand_secret_range) #define _bn_reduce_once BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_reduce_once) @@ -2374,7 +2418,11 @@ #define _c2i_ASN1_INTEGER BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, c2i_ASN1_INTEGER) #define _c2i_ASN1_OBJECT BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, c2i_ASN1_OBJECT) #define _chacha20_poly1305_open BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_open) +#define _chacha20_poly1305_open_avx2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_open_avx2) +#define _chacha20_poly1305_open_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_open_nohw) #define _chacha20_poly1305_seal 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_seal) +#define _chacha20_poly1305_seal_avx2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_seal_avx2) +#define _chacha20_poly1305_seal_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_seal_nohw) #define _crypto_gcm_clmul_enabled BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, crypto_gcm_clmul_enabled) #define _d2i_ASN1_BIT_STRING BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_ASN1_BIT_STRING) #define _d2i_ASN1_BMPSTRING BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_ASN1_BMPSTRING) @@ -2483,8 +2531,10 @@ #define _d2i_X509_VAL BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_X509_VAL) #define _d2i_X509_bio BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_X509_bio) #define _d2i_X509_fp BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_X509_fp) +#define _dh_asn1_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dh_asn1_meth) #define _dh_check_params_fast BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dh_check_params_fast) #define _dh_compute_key_padded_no_self_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dh_compute_key_padded_no_self_test) +#define _dh_pkey_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dh_pkey_meth) #define _dsa_asn1_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dsa_asn1_meth) #define _dsa_check_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dsa_check_key) #define _ec_GFp_mont_add BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ec_GFp_mont_add) @@ -2574,25 +2624,46 @@ #define _ec_set_to_safe_point BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ec_set_to_safe_point) #define _ec_simple_scalar_inv0_montgomery BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ec_simple_scalar_inv0_montgomery) #define _ec_simple_scalar_to_montgomery_inv_vartime BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ec_simple_scalar_to_montgomery_inv_vartime) -#define _ecdsa_do_verify_no_self_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_do_verify_no_self_test) -#define _ecdsa_sign_with_nonce_for_known_answer_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_sign_with_nonce_for_known_answer_test) -#define _ecp_nistz256_avx2_select_w7 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_avx2_select_w7) +#define _ecdsa_sign_fixed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_sign_fixed) +#define _ecdsa_sign_fixed_with_nonce_for_known_answer_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_sign_fixed_with_nonce_for_known_answer_test) +#define _ecdsa_verify_fixed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_verify_fixed) +#define _ecdsa_verify_fixed_no_self_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_verify_fixed_no_self_test) #define _ecp_nistz256_div_by_2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_div_by_2) #define _ecp_nistz256_mul_by_2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_by_2) #define _ecp_nistz256_mul_by_3 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_by_3) #define _ecp_nistz256_mul_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_mont) +#define _ecp_nistz256_mul_mont_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_mont_adx) +#define _ecp_nistz256_mul_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_mont_nohw) #define _ecp_nistz256_neg BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_neg) #define _ecp_nistz256_ord_mul_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, 
ecp_nistz256_ord_mul_mont) +#define _ecp_nistz256_ord_mul_mont_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont_adx) +#define _ecp_nistz256_ord_mul_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont_nohw) #define _ecp_nistz256_ord_sqr_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont) +#define _ecp_nistz256_ord_sqr_mont_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont_adx) +#define _ecp_nistz256_ord_sqr_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont_nohw) #define _ecp_nistz256_point_add BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add) +#define _ecp_nistz256_point_add_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_adx) #define _ecp_nistz256_point_add_affine BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine) +#define _ecp_nistz256_point_add_affine_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine_adx) +#define _ecp_nistz256_point_add_affine_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine_nohw) +#define _ecp_nistz256_point_add_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_nohw) #define _ecp_nistz256_point_double BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_double) +#define _ecp_nistz256_point_double_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_double_adx) +#define _ecp_nistz256_point_double_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_double_nohw) #define _ecp_nistz256_select_w5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w5) +#define _ecp_nistz256_select_w5_avx2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w5_avx2) +#define _ecp_nistz256_select_w5_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w5_nohw) #define _ecp_nistz256_select_w7 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w7) +#define _ecp_nistz256_select_w7_avx2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w7_avx2) +#define _ecp_nistz256_select_w7_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w7_nohw) #define _ecp_nistz256_sqr_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont) +#define _ecp_nistz256_sqr_mont_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont_adx) +#define _ecp_nistz256_sqr_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont_nohw) #define _ecp_nistz256_sub BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_sub) #define _ed25519_asn1_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ed25519_asn1_meth) #define _ed25519_pkey_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ed25519_pkey_meth) +#define _evp_md_md5_sha1 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, evp_md_md5_sha1) +#define _evp_pkey_set_method BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, evp_pkey_set_method) #define _fiat_curve25519_adx_mul BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, fiat_curve25519_adx_mul) #define _fiat_curve25519_adx_square BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, fiat_curve25519_adx_square) #define _fiat_p256_adx_mul BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, fiat_p256_adx_mul) @@ -2834,8 +2905,6 @@ #define _spx_fors_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_fors_sign) #define _spx_fors_sk_gen 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_fors_sk_gen) #define _spx_fors_treehash BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_fors_treehash) -#define _spx_generate_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_generate_key) -#define _spx_generate_key_from_seed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_generate_key_from_seed) #define _spx_get_tree_index BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_get_tree_index) #define _spx_ht_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_ht_sign) #define _spx_ht_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_ht_verify) @@ -2847,7 +2916,6 @@ #define _spx_set_tree_height BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_set_tree_height) #define _spx_set_tree_index BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_set_tree_index) #define _spx_set_type BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_set_type) -#define _spx_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_sign) #define _spx_thash_f BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_thash_f) #define _spx_thash_h BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_thash_h) #define _spx_thash_hmsg BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_thash_hmsg) @@ -2858,12 +2926,12 @@ #define _spx_to_uint64 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_to_uint64) #define _spx_treehash BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_treehash) #define _spx_uint64_to_len_bytes BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_uint64_to_len_bytes) -#define _spx_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_verify) #define _spx_wots_pk_from_sig BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_wots_pk_from_sig) #define _spx_wots_pk_gen BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_wots_pk_gen) #define _spx_wots_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_wots_sign) #define _spx_xmss_pk_from_sig BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_xmss_pk_from_sig) #define _spx_xmss_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_xmss_sign) +#define _swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE) #define _v2i_GENERAL_NAME BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, v2i_GENERAL_NAME) #define _v2i_GENERAL_NAMES BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, v2i_GENERAL_NAMES) #define _v2i_GENERAL_NAME_ex BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, v2i_GENERAL_NAME_ex) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bytestring.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bytestring.h index 65f9618d..3e8092a9 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bytestring.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bytestring.h @@ -639,6 +639,9 @@ OPENSSL_EXPORT int CBB_flush_asn1_set_of(CBB *cbb); // Unicode utilities. +// +// These functions consider noncharacters (see section 23.7 from Unicode 15.0.0) +// to be invalid code points and will treat them as an error condition. // The following functions read one Unicode code point from |cbs| with the // corresponding encoding and store it in |*out|. They return one on success and @@ -653,7 +656,9 @@ OPENSSL_EXPORT int CBS_get_utf32_be(CBS *cbs, uint32_t *out); OPENSSL_EXPORT size_t CBB_get_utf8_len(uint32_t u); // The following functions encode |u| to |cbb| with the corresponding -// encoding. They return one on success and zero on error. +// encoding. 
They return one on success and zero on error. Error conditions +// include |u| being an invalid code point, or |u| being unencodable in the +// specified encoding. OPENSSL_EXPORT int CBB_add_utf8(CBB *cbb, uint32_t u); OPENSSL_EXPORT int CBB_add_latin1(CBB *cbb, uint32_t u); OPENSSL_EXPORT int CBB_add_ucs2_be(CBB *cbb, uint32_t u); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_crypto.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_crypto.h index 15755064..f3cb46f0 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_crypto.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_crypto.h @@ -32,18 +32,9 @@ extern "C" { #endif -// crypto.h contains functions for initializing the crypto library. +// crypto.h contains functions for library-wide initialization and properties. -// CRYPTO_library_init initializes the crypto library. It must be called if the -// library is built with BORINGSSL_NO_STATIC_INITIALIZER. Otherwise, it does -// nothing and a static initializer is used instead. It is safe to call this -// function multiple times and concurrently from multiple threads. -// -// On some ARM configurations, this function may require filesystem access and -// should be called before entering a sandbox. -OPENSSL_EXPORT void CRYPTO_library_init(void); - // CRYPTO_is_confidential_build returns one if the linked version of BoringSSL // has been built with the BORINGSSL_CONFIDENTIAL define and zero otherwise. // @@ -164,7 +155,7 @@ OPENSSL_EXPORT void OPENSSL_load_builtin_modules(void); #define OPENSSL_INIT_NO_LOAD_CONFIG 0 #define OPENSSL_INIT_NO_ATEXIT 0 -// OPENSSL_init_crypto calls |CRYPTO_library_init| and returns one. +// OPENSSL_init_crypto returns one. OPENSSL_EXPORT int OPENSSL_init_crypto(uint64_t opts, const OPENSSL_INIT_SETTINGS *settings); @@ -178,6 +169,9 @@ OPENSSL_EXPORT int FIPS_mode_set(int on); // FIPS_module_name returns the name of the FIPS module. OPENSSL_EXPORT const char *FIPS_module_name(void); +// FIPS_module_hash returns the 32-byte hash of the FIPS module. +OPENSSL_EXPORT const uint8_t* FIPS_module_hash(void); + // FIPS_version returns the version of the FIPS module, or zero if the build // isn't exactly at a verified version. The version, expressed in base 10, will // be a date in the form yyyymmddXX where XX is often "00", but can be @@ -196,6 +190,10 @@ OPENSSL_EXPORT int FIPS_query_algorithm_status(const char *algorithm); OPENSSL_EXPORT int CRYPTO_has_broken_NEON(void); #endif +// CRYPTO_library_init does nothing. Historically, it was needed in some build +// configurations to initialization the library. This is no longer necessary. +OPENSSL_EXPORT void CRYPTO_library_init(void); + #if defined(__cplusplus) } // extern C diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dh.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dh.h index 96c1094f..f7399260 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dh.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dh.h @@ -75,6 +75,12 @@ extern "C" { // Allocation and destruction. +// +// A |DH| object represents a Diffie-Hellman key or group parameters. A given +// object may be used concurrently on multiple threads by non-mutating +// functions, provided no other thread is concurrently calling a mutating +// function. Unless otherwise documented, functions which take a |const| pointer +// are non-mutating and functions which take a non-|const| pointer are mutating. // DH_new returns a new, empty DH object or NULL on error. 
OPENSSL_EXPORT DH *DH_new(void); @@ -83,7 +89,8 @@ OPENSSL_EXPORT DH *DH_new(void); // count drops to zero. OPENSSL_EXPORT void DH_free(DH *dh); -// DH_up_ref increments the reference count of |dh| and returns one. +// DH_up_ref increments the reference count of |dh| and returns one. It does not +// mutate |dh| for thread-safety purposes and may be used concurrently. OPENSSL_EXPORT int DH_up_ref(DH *dh); @@ -214,6 +221,9 @@ OPENSSL_EXPORT int DH_generate_key(DH *dh); // Callers that expect a fixed-width secret should use this function over // |DH_compute_key|. Callers that use either function should migrate to a modern // primitive such as X25519 or ECDH with P-256 instead. +// +// This function does not mutate |dh| for thread-safety purposes and may be used +// concurrently. OPENSSL_EXPORT int DH_compute_key_padded(uint8_t *out, const BIGNUM *peers_key, DH *dh); @@ -225,6 +235,9 @@ OPENSSL_EXPORT int DH_compute_key_padded(uint8_t *out, const BIGNUM *peers_key, // // NOTE: this follows the usual BoringSSL return-value convention, but that's // different from |DH_compute_key| and |DH_compute_key_padded|. +// +// This function does not mutate |dh| for thread-safety purposes and may be used +// concurrently. OPENSSL_EXPORT int DH_compute_key_hashed(DH *dh, uint8_t *out, size_t *out_len, size_t max_out_len, const BIGNUM *peers_key, @@ -327,6 +340,9 @@ OPENSSL_EXPORT int i2d_DHparams(const DH *in, unsigned char **outp); // Callers that expect a fixed-width secret should use |DH_compute_key_padded| // instead. Callers that use either function should migrate to a modern // primitive such as X25519 or ECDH with P-256 instead. +// +// This function does not mutate |dh| for thread-safety purposes and may be used +// concurrently. OPENSSL_EXPORT int DH_compute_key(uint8_t *out, const BIGNUM *peers_key, DH *dh); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dsa.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dsa.h index 077aaef1..a16f934b 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dsa.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dsa.h @@ -78,6 +78,12 @@ extern "C" { // Allocation and destruction. +// +// A |DSA| object represents a DSA key or group parameters. A given object may +// be used concurrently on multiple threads by non-mutating functions, provided +// no other thread is concurrently calling a mutating function. Unless otherwise +// documented, functions which take a |const| pointer are non-mutating and +// functions which take a non-|const| pointer are mutating. // DSA_new returns a new, empty DSA object or NULL on error. OPENSSL_EXPORT DSA *DSA_new(void); @@ -86,7 +92,8 @@ OPENSSL_EXPORT DSA *DSA_new(void); // reference count drops to zero. OPENSSL_EXPORT void DSA_free(DSA *dsa); -// DSA_up_ref increments the reference count of |dsa| and returns one. +// DSA_up_ref increments the reference count of |dsa| and returns one. It does +// not mutate |dsa| for thread-safety purposes and may be used concurrently. OPENSSL_EXPORT int DSA_up_ref(DSA *dsa); @@ -216,7 +223,7 @@ OPENSSL_EXPORT DSA_SIG *DSA_do_sign(const uint8_t *digest, size_t digest_len, // // TODO(fork): deprecate. OPENSSL_EXPORT int DSA_do_verify(const uint8_t *digest, size_t digest_len, - DSA_SIG *sig, const DSA *dsa); + const DSA_SIG *sig, const DSA *dsa); // DSA_do_check_signature sets |*out_valid| to zero. 
Then it verifies that |sig| // is a valid signature, by the public key in |dsa| of the hash in |digest| @@ -225,7 +232,7 @@ OPENSSL_EXPORT int DSA_do_verify(const uint8_t *digest, size_t digest_len, // It returns one if it was able to verify the signature as valid or invalid, // and zero on error. OPENSSL_EXPORT int DSA_do_check_signature(int *out_valid, const uint8_t *digest, - size_t digest_len, DSA_SIG *sig, + size_t digest_len, const DSA_SIG *sig, const DSA *dsa); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp.h index 828bc8d3..fa8c5b54 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp.h @@ -136,10 +136,6 @@ OPENSSL_EXPORT int EVP_PKEY_bits(const EVP_PKEY *pkey); // values. OPENSSL_EXPORT int EVP_PKEY_id(const EVP_PKEY *pkey); -// EVP_PKEY_type returns |nid| if |nid| is a known key type and |NID_undef| -// otherwise. -OPENSSL_EXPORT int EVP_PKEY_type(int nid); - // Getting and setting concrete public key types. // @@ -171,6 +167,11 @@ OPENSSL_EXPORT int EVP_PKEY_assign_EC_KEY(EVP_PKEY *pkey, EC_KEY *key); OPENSSL_EXPORT EC_KEY *EVP_PKEY_get0_EC_KEY(const EVP_PKEY *pkey); OPENSSL_EXPORT EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey); +OPENSSL_EXPORT int EVP_PKEY_set1_DH(EVP_PKEY *pkey, DH *key); +OPENSSL_EXPORT int EVP_PKEY_assign_DH(EVP_PKEY *pkey, DH *key); +OPENSSL_EXPORT DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey); +OPENSSL_EXPORT DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey); + #define EVP_PKEY_NONE NID_undef #define EVP_PKEY_RSA NID_rsaEncryption #define EVP_PKEY_RSA_PSS NID_rsassaPss @@ -179,6 +180,7 @@ OPENSSL_EXPORT EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey); #define EVP_PKEY_ED25519 NID_ED25519 #define EVP_PKEY_X25519 NID_X25519 #define EVP_PKEY_HKDF NID_hkdf +#define EVP_PKEY_DH NID_dhKeyAgreement // EVP_PKEY_set_type sets the type of |pkey| to |type|. It returns one if // successful or zero if the |type| argument is not one of the |EVP_PKEY_*| @@ -814,11 +816,23 @@ OPENSSL_EXPORT int EVP_PKEY_CTX_set_ec_paramgen_curve_nid(EVP_PKEY_CTX *ctx, int nid); -// Deprecated functions. +// Diffie-Hellman-specific control functions. -// EVP_PKEY_DH is defined for compatibility, but it is impossible to create an -// |EVP_PKEY| of that type. -#define EVP_PKEY_DH NID_dhKeyAgreement +// EVP_PKEY_CTX_set_dh_pad configures configures whether |ctx|, which must be an +// |EVP_PKEY_derive| operation, configures the handling of leading zeros in the +// Diffie-Hellman shared secret. If |pad| is zero, leading zeros are removed +// from the secret. If |pad| is non-zero, the fixed-width shared secret is used +// unmodified, as in PKCS #3. If this function is not called, the default is to +// remove leading zeros. +// +// WARNING: The behavior when |pad| is zero leaks information about the shared +// secret. This may result in side channel attacks such as +// https://raccoon-attack.com/, particularly when the same private key is used +// for multiple operations. +OPENSSL_EXPORT int EVP_PKEY_CTX_set_dh_pad(EVP_PKEY_CTX *ctx, int pad); + + +// Deprecated functions. // EVP_PKEY_RSA2 was historically an alternate form for RSA public keys (OID // 2.5.8.1.1), but is no longer accepted. @@ -917,12 +931,6 @@ OPENSSL_EXPORT EVP_PKEY *d2i_AutoPrivateKey(EVP_PKEY **out, const uint8_t **inp, OPENSSL_EXPORT EVP_PKEY *d2i_PublicKey(int type, EVP_PKEY **out, const uint8_t **inp, long len); -// EVP_PKEY_get0_DH returns NULL. 
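To illustrate how the restored DH support in |EVP_PKEY| and the new |EVP_PKEY_CTX_set_dh_pad| control fit together, here is a minimal sketch of a padded shared-secret derivation. It assumes a |DH| key pair |dh| and a peer |EVP_PKEY| created elsewhere, the caller's |out| buffer is at least DH_size(dh) bytes, and the include paths are illustrative; error handling is compressed.

#include <CCryptoBoringSSL_dh.h>
#include <CCryptoBoringSSL_evp.h>

/* Sketch: derive a fixed-width DH shared secret via the EVP_PKEY interface.
 * |dh| is our key pair and |peer| holds the peer's public key; on entry
 * |*out_len| is the size of |out|. Returns 1 on success, 0 on error. */
static int derive_padded_dh_secret(DH *dh, EVP_PKEY *peer,
                                   uint8_t *out, size_t *out_len) {
  int ok = 0;
  EVP_PKEY *pkey = EVP_PKEY_new();
  EVP_PKEY_CTX *ctx = NULL;
  if (pkey == NULL || !EVP_PKEY_set1_DH(pkey, dh)) {
    goto done;
  }
  ctx = EVP_PKEY_CTX_new(pkey, NULL);
  if (ctx == NULL ||
      !EVP_PKEY_derive_init(ctx) ||
      !EVP_PKEY_derive_set_peer(ctx, peer) ||
      /* Keep leading zeros (fixed-width PKCS #3 output), per the warning
       * above about the truncating behavior. */
      !EVP_PKEY_CTX_set_dh_pad(ctx, 1) ||
      !EVP_PKEY_derive(ctx, out, out_len)) {
    goto done;
  }
  ok = 1;

done:
  EVP_PKEY_CTX_free(ctx);
  EVP_PKEY_free(pkey);
  return ok;
}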
-OPENSSL_EXPORT DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey); - -// EVP_PKEY_get1_DH returns NULL. -OPENSSL_EXPORT DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey); - // EVP_PKEY_CTX_set_ec_param_enc returns one if |encoding| is // |OPENSSL_EC_NAMED_CURVE| or zero with an error otherwise. OPENSSL_EXPORT int EVP_PKEY_CTX_set_ec_param_enc(EVP_PKEY_CTX *ctx, @@ -1036,6 +1044,9 @@ OPENSSL_EXPORT int EVP_PKEY_CTX_set_dsa_paramgen_q_bits(EVP_PKEY_CTX *ctx, // Use the |EVP_PKEY_assign_*| functions instead. OPENSSL_EXPORT int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key); +// EVP_PKEY_type returns |nid|. +OPENSSL_EXPORT int EVP_PKEY_type(int nid); + // Preprocessor compatibility section (hidden). // diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp_errors.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp_errors.h index 8583f521..163f17e2 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp_errors.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp_errors.h @@ -95,5 +95,6 @@ #define EVP_R_NOT_XOF_OR_INVALID_LENGTH 135 #define EVP_R_EMPTY_PSK 136 #define EVP_R_INVALID_BUFFER_SIZE 137 +#define EVP_R_EXPECTING_A_DH_KEY 138 #endif // OPENSSL_HEADER_EVP_ERRORS_H diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_hpke.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_hpke.h index 32a942f9..caf048c0 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_hpke.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_hpke.h @@ -40,12 +40,14 @@ extern "C" { // respectively. // The following constants are KEM identifiers. +#define EVP_HPKE_DHKEM_P256_HKDF_SHA256 0x0010 #define EVP_HPKE_DHKEM_X25519_HKDF_SHA256 0x0020 // The following functions are KEM algorithms which may be used with HPKE. Note // that, while some HPKE KEMs use KDFs internally, this is separate from the // |EVP_HPKE_KDF| selection. OPENSSL_EXPORT const EVP_HPKE_KEM *EVP_hpke_x25519_hkdf_sha256(void); +OPENSSL_EXPORT const EVP_HPKE_KEM *EVP_hpke_p256_hkdf_sha256(void); // EVP_HPKE_KEM_id returns the HPKE KEM identifier for |kem|, which // will be one of the |EVP_HPKE_KEM_*| constants. @@ -53,7 +55,7 @@ OPENSSL_EXPORT uint16_t EVP_HPKE_KEM_id(const EVP_HPKE_KEM *kem); // EVP_HPKE_MAX_PUBLIC_KEY_LENGTH is the maximum length of an encoded public key // for all KEMs currently supported by this library. -#define EVP_HPKE_MAX_PUBLIC_KEY_LENGTH 32 +#define EVP_HPKE_MAX_PUBLIC_KEY_LENGTH 65 // EVP_HPKE_KEM_public_key_len returns the length of a public key for |kem|. // This value will be at most |EVP_HPKE_MAX_PUBLIC_KEY_LENGTH|. @@ -69,7 +71,7 @@ OPENSSL_EXPORT size_t EVP_HPKE_KEM_private_key_len(const EVP_HPKE_KEM *kem); // EVP_HPKE_MAX_ENC_LENGTH is the maximum length of "enc", the encapsulated // shared secret, for all KEMs currently supported by this library. -#define EVP_HPKE_MAX_ENC_LENGTH 32 +#define EVP_HPKE_MAX_ENC_LENGTH 65 // EVP_HPKE_KEM_enc_len returns the length of the "enc", the encapsulated shared // secret, for |kem|. This value will be at most |EVP_HPKE_MAX_ENC_LENGTH|. @@ -233,7 +235,7 @@ OPENSSL_EXPORT int EVP_HPKE_CTX_setup_sender( // EVP_HPKE_CTX_setup_sender_with_seed_for_testing behaves like // |EVP_HPKE_CTX_setup_sender|, but takes a seed to behave deterministically. // The seed's format depends on |kem|. For X25519, it is the sender's -// ephemeral private key. +// ephemeral private key. For P256, it's an HKDF input. 
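As a rough illustration of the newly added P-256 KEM, the following sketch sets up an HPKE sender context and seals one message. The recipient's encoded public key, the buffer sizes, and the choice of HKDF-SHA256 with AES-128-GCM are assumptions for the example, not requirements of the API.

#include <CCryptoBoringSSL_hpke.h>

/* Sketch: HPKE sender setup with the P-256 KEM, followed by a single seal.
 * |peer_pub|/|peer_pub_len| is the recipient's encoded public key (65 bytes
 * for P-256); |out_enc| must hold at least EVP_HPKE_MAX_ENC_LENGTH bytes.
 * Returns 1 on success, 0 on error. */
static int hpke_p256_seal_one(const uint8_t *peer_pub, size_t peer_pub_len,
                              const uint8_t *msg, size_t msg_len,
                              uint8_t *out_enc, size_t *out_enc_len,
                              uint8_t *out_ct, size_t *out_ct_len,
                              size_t max_ct) {
  EVP_HPKE_CTX ctx;
  EVP_HPKE_CTX_zero(&ctx);
  int ok =
      EVP_HPKE_CTX_setup_sender(&ctx, out_enc, out_enc_len,
                                EVP_HPKE_MAX_ENC_LENGTH /* now 65 */,
                                EVP_hpke_p256_hkdf_sha256(),
                                EVP_hpke_hkdf_sha256(), EVP_hpke_aes_128_gcm(),
                                peer_pub, peer_pub_len,
                                /*info=*/NULL, /*info_len=*/0) &&
      EVP_HPKE_CTX_seal(&ctx, out_ct, out_ct_len, max_ct, msg, msg_len,
                        /*ad=*/NULL, /*ad_len=*/0);
  EVP_HPKE_CTX_cleanup(&ctx);
  return ok;
}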
OPENSSL_EXPORT int EVP_HPKE_CTX_setup_sender_with_seed_for_testing( EVP_HPKE_CTX *ctx, uint8_t *out_enc, size_t *out_enc_len, size_t max_enc, const EVP_HPKE_KEM *kem, const EVP_HPKE_KDF *kdf, const EVP_HPKE_AEAD *aead, @@ -265,7 +267,7 @@ OPENSSL_EXPORT int EVP_HPKE_CTX_setup_auth_sender( // EVP_HPKE_CTX_setup_auth_sender_with_seed_for_testing behaves like // |EVP_HPKE_CTX_setup_auth_sender|, but takes a seed to behave // deterministically. The seed's format depends on |kem|. For X25519, it is the -// sender's ephemeral private key. +// sender's ephemeral private key. For P256, it's an HKDF input. OPENSSL_EXPORT int EVP_HPKE_CTX_setup_auth_sender_with_seed_for_testing( EVP_HPKE_CTX *ctx, uint8_t *out_enc, size_t *out_enc_len, size_t max_enc, const EVP_HPKE_KEY *key, const EVP_HPKE_KDF *kdf, const EVP_HPKE_AEAD *aead, @@ -375,8 +377,8 @@ struct evp_hpke_ctx_st { struct evp_hpke_key_st { const EVP_HPKE_KEM *kem; - uint8_t private_key[X25519_PRIVATE_KEY_LEN]; - uint8_t public_key[X25519_PUBLIC_VALUE_LEN]; + uint8_t private_key[EVP_HPKE_MAX_PRIVATE_KEY_LENGTH]; + uint8_t public_key[EVP_HPKE_MAX_PUBLIC_KEY_LENGTH]; }; diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mldsa.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mldsa.h new file mode 100644 index 00000000..80a70307 --- /dev/null +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mldsa.h @@ -0,0 +1,136 @@ +/* Copyright (c) 2024, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_MLDSA_H_ +#define OPENSSL_HEADER_MLDSA_H_ + +#include "CCryptoBoringSSL_base.h" + +#if defined(__cplusplus) +extern "C" { +#endif + + +// ML-DSA-65. +// +// This implements the Module-Lattice-Based Digital Signature Standard from +// https://csrc.nist.gov/pubs/fips/204/final + + +// MLDSA65_private_key contains an ML-DSA-65 private key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLDSA65_private_key { + union { + uint8_t bytes[32 + 32 + 64 + 256 * 4 * (5 + 6 + 6)]; + uint32_t alignment; + } opaque; +}; + +// MLDSA65_public_key contains an ML-DSA-65 public key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLDSA65_public_key { + union { + uint8_t bytes[32 + 64 + 256 * 4 * 6]; + uint32_t alignment; + } opaque; +}; + +// MLDSA65_PRIVATE_KEY_BYTES is the number of bytes in an encoded ML-DSA-65 +// private key. +#define MLDSA65_PRIVATE_KEY_BYTES 4032 + +// MLDSA65_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-DSA-65 +// public key. +#define MLDSA65_PUBLIC_KEY_BYTES 1952 + +// MLDSA65_SIGNATURE_BYTES is the number of bytes in an encoded ML-DSA-65 +// signature. +#define MLDSA65_SIGNATURE_BYTES 3309 + +// MLDSA_SEED_BYTES is the number of bytes in an ML-DSA seed value. 
+#define MLDSA_SEED_BYTES 32 + +// MLDSA65_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key|, writes the seed to +// |out_seed|, and sets |out_private_key| to the private key. Returns 1 on +// success and 0 on allocation failure. +OPENSSL_EXPORT int MLDSA65_generate_key( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], + struct MLDSA65_private_key *out_private_key); + +// MLDSA65_private_key_from_seed regenerates a private key from a seed value +// that was generated by |MLDSA65_generate_key|. Returns 1 on success and 0 on +// allocation failure or if |seed_len| is incorrect. +OPENSSL_EXPORT int MLDSA65_private_key_from_seed( + struct MLDSA65_private_key *out_private_key, const uint8_t *seed, + size_t seed_len); + +// MLDSA65_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. Returns 1 on success and 0 on failure. +OPENSSL_EXPORT int MLDSA65_public_from_private( + struct MLDSA65_public_key *out_public_key, + const struct MLDSA65_private_key *private_key); + +// MLDSA65_sign generates a signature for the message |msg| of length +// |msg_len| using |private_key| (following the randomized algorithm), and +// writes the encoded signature to |out_encoded_signature|. The |context| +// argument is also signed over and can be used to include implicit contextual +// information that isn't included in |msg|. The same value of |context| must be +// presented to |MLDSA65_verify| in order for the generated signature to be +// considered valid. |context| and |context_len| may be |NULL| and 0 to use an +// empty context (this is common). Returns 1 on success and 0 on failure. +OPENSSL_EXPORT int MLDSA65_sign( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context, size_t context_len); + +// MLDSA65_verify verifies that |signature| constitutes a valid +// signature for the message |msg| of length |msg_len| using |public_key|. The +// value of |context| must equal the value that was passed to |MLDSA65_sign| +// when the signature was generated. Returns 1 on success or 0 on error. +OPENSSL_EXPORT int MLDSA65_verify(const struct MLDSA65_public_key *public_key, + const uint8_t *signature, + size_t signature_len, const uint8_t *msg, + size_t msg_len, const uint8_t *context, + size_t context_len); + + +// Serialisation of keys. + +// MLDSA65_marshal_public_key serializes |public_key| to |out| in the standard +// format for ML-DSA-65 public keys. It returns 1 on success or 0 on +// allocation error. +OPENSSL_EXPORT int MLDSA65_marshal_public_key( + CBB *out, const struct MLDSA65_public_key *public_key); + +// MLDSA65_parse_public_key parses a public key, in the format generated by +// |MLDSA65_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns 1 on success or 0 on parse error or if +// there are trailing bytes in |in|. +OPENSSL_EXPORT int MLDSA65_parse_public_key( + struct MLDSA65_public_key *public_key, CBS *in); + +// MLDSA65_parse_private_key parses a private key, in the NIST format, from |in| +// and writes the result to |out_private_key|. It returns 1 on success or 0 on +// parse error or if there are trailing bytes in |in|. 
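A minimal round-trip sketch of the ML-DSA-65 functions declared above, using an empty context string. The buffer sizes come from the constants in this header, and error handling is reduced to early returns; the verifier side parses the encoded public key as it would on the wire.

#include <CCryptoBoringSSL_bytestring.h>
#include <CCryptoBoringSSL_mldsa.h>

/* Sketch: generate an ML-DSA-65 key pair, sign a message, and verify the
 * signature against the encoded public key. Returns 1 on success. */
static int mldsa65_round_trip(void) {
  uint8_t encoded_pub[MLDSA65_PUBLIC_KEY_BYTES];
  uint8_t seed[MLDSA_SEED_BYTES];
  struct MLDSA65_private_key priv;
  if (!MLDSA65_generate_key(encoded_pub, seed, &priv)) {
    return 0;
  }

  static const uint8_t kMsg[] = {'h', 'e', 'l', 'l', 'o'};
  uint8_t sig[MLDSA65_SIGNATURE_BYTES];
  if (!MLDSA65_sign(sig, &priv, kMsg, sizeof(kMsg), /*context=*/NULL,
                    /*context_len=*/0)) {
    return 0;
  }

  /* The verifier would normally parse the encoded public key it received. */
  struct MLDSA65_public_key pub;
  CBS cbs;
  CBS_init(&cbs, encoded_pub, sizeof(encoded_pub));
  if (!MLDSA65_parse_public_key(&pub, &cbs)) {
    return 0;
  }
  return MLDSA65_verify(&pub, sig, sizeof(sig), kMsg, sizeof(kMsg),
                        /*context=*/NULL, /*context_len=*/0);
}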
+OPENSSL_EXPORT int MLDSA65_parse_private_key( + struct MLDSA65_private_key *private_key, CBS *in); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_MLDSA_H_ diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mlkem.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mlkem.h new file mode 100644 index 00000000..4472aa39 --- /dev/null +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mlkem.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2024, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_MLKEM_H +#define OPENSSL_HEADER_MLKEM_H + +#include "CCryptoBoringSSL_base.h" + +#if defined(__cplusplus) +extern "C" { +#endif + + +// ML-KEM-768. +// +// This implements the Module-Lattice-Based Key-Encapsulation Mechanism from +// https://csrc.nist.gov/pubs/fips/204/final + + +// MLKEM768_public_key contains an ML-KEM-768 public key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLKEM768_public_key { + union { + uint8_t bytes[512 * (3 + 9) + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// MLKEM768_private_key contains an ML-KEM-768 private key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLKEM768_private_key { + union { + uint8_t bytes[512 * (3 + 3 + 9) + 32 + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// MLKEM768_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM-768 +// public key. +#define MLKEM768_PUBLIC_KEY_BYTES 1184 + +// MLKEM_SEED_BYTES is the number of bytes in an ML-KEM seed. +#define MLKEM_SEED_BYTES 64 + +// MLKEM768_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key| and sets |out_private_key| to +// the private key. If |optional_out_seed| is not NULL then the seed used to +// generate the private key is written to it. +OPENSSL_EXPORT void MLKEM768_generate_key( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + struct MLKEM768_private_key *out_private_key); + +// MLKEM768_private_key_from_seed derives a private key from a seed that was +// generated by |MLKEM768_generate_key|. It fails and returns 0 if |seed_len| is +// incorrect, otherwise it writes |*out_private_key| and returns 1. +OPENSSL_EXPORT int MLKEM768_private_key_from_seed( + struct MLKEM768_private_key *out_private_key, const uint8_t *seed, + size_t seed_len); + +// MLKEM768_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. (This is faster than parsing the output of +// |MLKEM768_generate_key| if, for some reason, you need to encapsulate to a key +// that was just generated.) 
+OPENSSL_EXPORT void MLKEM768_public_from_private( + struct MLKEM768_public_key *out_public_key, + const struct MLKEM768_private_key *private_key); + +// MLKEM768_CIPHERTEXT_BYTES is number of bytes in the ML-KEM-768 ciphertext. +#define MLKEM768_CIPHERTEXT_BYTES 1088 + +// MLKEM_SHARED_SECRET_BYTES is the number of bytes in an ML-KEM shared secret. +#define MLKEM_SHARED_SECRET_BYTES 32 + +// MLKEM768_encap encrypts a random shared secret for |public_key|, writes the +// ciphertext to |out_ciphertext|, and writes the random shared secret to +// |out_shared_secret|. +OPENSSL_EXPORT void MLKEM768_encap( + uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM768_public_key *public_key); + +// MLKEM768_decap decrypts a shared secret from |ciphertext| using |private_key| +// and writes it to |out_shared_secret|. If |ciphertext_len| is incorrect it +// returns 0, otherwise it returns 1. If |ciphertext| is invalid (but of the +// correct length), |out_shared_secret| is filled with a key that will always be +// the same for the same |ciphertext| and |private_key|, but which appears to be +// random unless one has access to |private_key|. These alternatives occur in +// constant time. Any subsequent symmetric encryption using |out_shared_secret| +// must use an authenticated encryption scheme in order to discover the +// decapsulation failure. +OPENSSL_EXPORT int MLKEM768_decap( + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const struct MLKEM768_private_key *private_key); + + +// Serialisation of keys. + +// MLKEM768_marshal_public_key serializes |public_key| to |out| in the standard +// format for ML-KEM-768 public keys. It returns one on success or zero on +// allocation error. +OPENSSL_EXPORT int MLKEM768_marshal_public_key( + CBB *out, const struct MLKEM768_public_key *public_key); + +// MLKEM768_parse_public_key parses a public key, in the format generated by +// |MLKEM768_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. +OPENSSL_EXPORT int MLKEM768_parse_public_key( + struct MLKEM768_public_key *out_public_key, CBS *in); + +// MLKEM768_PRIVATE_KEY_BYTES is the length of the data produced by +// |MLKEM768_marshal_private_key|. +#define MLKEM768_PRIVATE_KEY_BYTES 2400 + +// MLKEM768_parse_private_key parses a private key, in NIST's format for +// private keys, from |in| and writes the result to |out_private_key|. It +// returns one on success or zero on parse error or if there are trailing bytes +// in |in|. This format is verbose and should be avoided. Private keys should be +// stored as seeds and parsed using |MLKEM768_private_key_from_seed|. +OPENSSL_EXPORT int MLKEM768_parse_private_key( + struct MLKEM768_private_key *out_private_key, CBS *in); + + +// ML-KEM-1024 +// +// ML-KEM-1024 also exists. You should prefer ML-KEM-768 where possible. + +// MLKEM1024_public_key contains an ML-KEM-1024 public key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLKEM1024_public_key { + union { + uint8_t bytes[512 * (4 + 16) + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// MLKEM1024_private_key contains a ML-KEM-1024 private key. The contents of +// this object should never leave the address space since the format is +// unstable. 
+struct MLKEM1024_private_key { + union { + uint8_t bytes[512 * (4 + 4 + 16) + 32 + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// MLKEM1024_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM-1024 +// public key. +#define MLKEM1024_PUBLIC_KEY_BYTES 1568 + +// MLKEM1024_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key| and sets |out_private_key| to +// the private key. If |optional_out_seed| is not NULL then the seed used to +// generate the private key is written to it. +OPENSSL_EXPORT void MLKEM1024_generate_key( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + struct MLKEM1024_private_key *out_private_key); + +// MLKEM1024_private_key_from_seed derives a private key from a seed that was +// generated by |MLKEM1024_generate_key|. It fails and returns 0 if |seed_len| +// is incorrect, otherwise it writes |*out_private_key| and returns 1. +OPENSSL_EXPORT int MLKEM1024_private_key_from_seed( + struct MLKEM1024_private_key *out_private_key, const uint8_t *seed, + size_t seed_len); + +// MLKEM1024_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. (This is faster than parsing the output of +// |MLKEM1024_generate_key| if, for some reason, you need to encapsulate to a +// key that was just generated.) +OPENSSL_EXPORT void MLKEM1024_public_from_private( + struct MLKEM1024_public_key *out_public_key, + const struct MLKEM1024_private_key *private_key); + +// MLKEM1024_CIPHERTEXT_BYTES is number of bytes in the ML-KEM-1024 ciphertext. +#define MLKEM1024_CIPHERTEXT_BYTES 1568 + +// MLKEM1024_encap encrypts a random shared secret for |public_key|, writes the +// ciphertext to |out_ciphertext|, and writes the random shared secret to +// |out_shared_secret|. +OPENSSL_EXPORT void MLKEM1024_encap( + uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM1024_public_key *public_key); + +// MLKEM1024_decap decrypts a shared secret from |ciphertext| using +// |private_key| and writes it to |out_shared_secret|. If |ciphertext_len| is +// incorrect it returns 0, otherwise it returns 1. If |ciphertext| is invalid +// (but of the correct length), |out_shared_secret| is filled with a key that +// will always be the same for the same |ciphertext| and |private_key|, but +// which appears to be random unless one has access to |private_key|. These +// alternatives occur in constant time. Any subsequent symmetric encryption +// using |out_shared_secret| must use an authenticated encryption scheme in +// order to discover the decapsulation failure. +OPENSSL_EXPORT int MLKEM1024_decap( + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const struct MLKEM1024_private_key *private_key); + + +// Serialisation of ML-KEM-1024 keys. + +// MLKEM1024_marshal_public_key serializes |public_key| to |out| in the standard +// format for ML-KEM-1024 public keys. It returns one on success or zero on +// allocation error. +OPENSSL_EXPORT int MLKEM1024_marshal_public_key( + CBB *out, const struct MLKEM1024_public_key *public_key); + +// MLKEM1024_parse_public_key parses a public key, in the format generated by +// |MLKEM1024_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. 
+OPENSSL_EXPORT int MLKEM1024_parse_public_key( + struct MLKEM1024_public_key *out_public_key, CBS *in); + +// MLKEM1024_PRIVATE_KEY_BYTES is the length of the data produced by +// |MLKEM1024_marshal_private_key|. +#define MLKEM1024_PRIVATE_KEY_BYTES 3168 + +// MLKEM1024_parse_private_key parses a private key, in NIST's format for +// private keys, from |in| and writes the result to |out_private_key|. It +// returns one on success or zero on parse error or if there are trailing bytes +// in |in|. This format is verbose and should be avoided. Private keys should be +// stored as seeds and parsed using |MLKEM1024_private_key_from_seed|. +OPENSSL_EXPORT int MLKEM1024_parse_private_key( + struct MLKEM1024_private_key *out_private_key, CBS *in); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_MLKEM_H diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_nid.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_nid.h index a1c03a07..521adfe6 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_nid.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_nid.h @@ -4255,6 +4255,9 @@ extern "C" { #define SN_X25519Kyber768Draft00 "X25519Kyber768Draft00" #define NID_X25519Kyber768Draft00 964 +#define SN_X25519MLKEM768 "X25519MLKEM768" +#define NID_X25519MLKEM768 965 + #if defined(__cplusplus) } /* extern C */ diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_pem.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_pem.h index b93b284c..443068a6 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_pem.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_pem.h @@ -385,10 +385,9 @@ OPENSSL_EXPORT int PEM_ASN1_write(i2d_of_void *i2d, const char *name, FILE *fp, pem_password_cb *callback, void *u); // PEM_def_callback treats |userdata| as a string and copies it into |buf|, -// assuming its |size| is sufficient. Returns the length of the string, or 0 -// if there is not enough room. If either |buf| or |userdata| is NULL, 0 is -// returned. Note that this is different from OpenSSL, which prompts for a -// password. +// assuming its |size| is sufficient. Returns the length of the string, or -1 on +// error. Error cases the buffer being too small, or |buf| and |userdata| being +// NULL. Note that this is different from OpenSSL, which prompts for a password. OPENSSL_EXPORT int PEM_def_callback(char *buf, int size, int rwflag, void *userdata); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_rand.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_rand.h index fcac8f2c..5ca7fe94 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_rand.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_rand.h @@ -33,20 +33,29 @@ OPENSSL_EXPORT int RAND_bytes(uint8_t *buf, size_t len); // Obscure functions. #if !defined(OPENSSL_WINDOWS) -// RAND_enable_fork_unsafe_buffering enables efficient buffered reading of -// /dev/urandom. It adds an overhead of a few KB of memory per thread. It must -// be called before the first call to |RAND_bytes|. +// RAND_enable_fork_unsafe_buffering indicates that clones of the address space, +// e.g. via |fork|, will never call into BoringSSL. It may be used to disable +// BoringSSL's more expensive fork-safety measures. However, calling this +// function and then using BoringSSL across |fork| calls will leak secret keys. +// |fd| must be -1. // -// |fd| must be -1. We no longer support setting the file descriptor with this -// function. 
+// WARNING: This function affects BoringSSL for the entire address space. Thus +// this function should never be called by library code, only by code with +// global knowledge of the application's use of BoringSSL. // -// It has an unusual name because the buffer is unsafe across calls to |fork|. -// Hence, this function should never be called by libraries. +// Do not use this function unless a performance issue was measured with the +// default behavior. BoringSSL can efficiently detect forks on most platforms, +// in which case this function is a no-op and is unnecessary. In particular, +// Linux kernel versions 4.14 or later provide |MADV_WIPEONFORK|. Future +// versions of BoringSSL will remove this functionality when older kernels are +// sufficiently rare. +// +// This function has an unusual name because it historically controlled internal +// buffers, but no longer does. OPENSSL_EXPORT void RAND_enable_fork_unsafe_buffering(int fd); -// RAND_disable_fork_unsafe_buffering disables efficient buffered reading of -// /dev/urandom, causing BoringSSL to always draw entropy on every request -// for random bytes. +// RAND_disable_fork_unsafe_buffering restores BoringSSL's default fork-safety +// protections. See also |RAND_enable_fork_unsafe_buffering|. OPENSSL_EXPORT void RAND_disable_fork_unsafe_buffering(void); #endif diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_service_indicator.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_service_indicator.h index f3a0c1cc..87625dc6 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_service_indicator.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_service_indicator.h @@ -56,7 +56,7 @@ extern "C++" { return func; \ }() -namespace bssl { +BSSL_NAMESPACE_BEGIN enum class FIPSStatus { NOT_APPROVED = 0, @@ -87,7 +87,7 @@ class FIPSIndicatorHelper { const uint64_t before_; }; -} // namespace bssl +BSSL_NAMESPACE_END } // extern "C++" #endif // !BORINGSSL_NO_CXX diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_sha.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_sha.h index 471bda13..dff93f36 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_sha.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_sha.h @@ -58,6 +58,7 @@ #define OPENSSL_HEADER_SHA_H #include "CCryptoBoringSSL_base.h" +#include "CCryptoBoringSSL_bcm_public.h" // IWYU pragma: export #if defined(__cplusplus) extern "C" { @@ -112,29 +113,6 @@ OPENSSL_EXPORT void SHA1_Transform(SHA_CTX *sha, OPENSSL_EXPORT void CRYPTO_fips_186_2_prf( uint8_t *out, size_t out_len, const uint8_t xkey[SHA_DIGEST_LENGTH]); -struct sha_state_st { -#if defined(__cplusplus) || defined(OPENSSL_WINDOWS) - uint32_t h[5]; -#else - // wpa_supplicant accesses |h0|..|h4| so we must support those names for - // compatibility with it until it can be updated. Anonymous unions are only - // standard in C11, so disable this workaround in C++. - union { - uint32_t h[5]; - struct { - uint32_t h0; - uint32_t h1; - uint32_t h2; - uint32_t h3; - uint32_t h4; - }; - }; -#endif - uint32_t Nl, Nh; - uint8_t data[SHA_CBLOCK]; - unsigned num; -}; - // SHA-224. 
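The ML-DSA-65 declarations added in CCryptoBoringSSL_mldsa.h earlier in this patch compose as follows. This is a minimal, hypothetical sketch rather than part of the generated sources: the include path and the helper name mldsa65_roundtrip are illustrative, and error handling is abbreviated.

#include "CCryptoBoringSSL_mldsa.h"

// Generate an ML-DSA-65 key pair, sign a message with an empty context, and
// verify the signature. Returns 1 on success and 0 on failure.
static int mldsa65_roundtrip(void) {
  static struct MLDSA65_private_key priv;
  static struct MLDSA65_public_key pub;
  uint8_t encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES];
  uint8_t seed[MLDSA_SEED_BYTES];
  if (!MLDSA65_generate_key(encoded_public_key, seed, &priv)) {
    return 0;
  }
  // |seed| alone is enough to recreate the private key later via
  // |MLDSA65_private_key_from_seed|.
  if (!MLDSA65_public_from_private(&pub, &priv)) {
    return 0;
  }

  static const uint8_t kMsg[] = {'h', 'i'};
  uint8_t sig[MLDSA65_SIGNATURE_BYTES];
  // NULL/0 selects the empty context; the verifier must pass the same context.
  if (!MLDSA65_sign(sig, &priv, kMsg, sizeof(kMsg), /*context=*/NULL, 0)) {
    return 0;
  }
  return MLDSA65_verify(&pub, sig, sizeof(sig), kMsg, sizeof(kMsg),
                        /*context=*/NULL, 0);
}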
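The ML-KEM-768 flow in CCryptoBoringSSL_mlkem.h follows the same pattern: the recipient generates a key pair, a sender encapsulates to the public key, and the recipient decapsulates the same 32-byte secret. Again a hypothetical sketch under the same assumptions; the final memcmp only illustrates that both sides derive the same value, since, as the MLKEM768_decap comment notes, a mismatch caused by an invalid ciphertext can only be detected through a subsequent authenticated encryption scheme.

#include <string.h>

#include "CCryptoBoringSSL_mlkem.h"

// Encapsulate and decapsulate a shared secret with ML-KEM-768. Returns 1 if
// both sides computed the same secret.
static int mlkem768_roundtrip(void) {
  static struct MLKEM768_private_key priv;
  static struct MLKEM768_public_key pub;
  uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
  uint8_t seed[MLKEM_SEED_BYTES];
  MLKEM768_generate_key(encoded_public_key, seed, &priv);

  // A real sender would parse |encoded_public_key| with
  // |MLKEM768_parse_public_key|; here the in-memory key is reused directly.
  MLKEM768_public_from_private(&pub, &priv);

  uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
  uint8_t sender_secret[MLKEM_SHARED_SECRET_BYTES];
  MLKEM768_encap(ciphertext, sender_secret, &pub);

  uint8_t recipient_secret[MLKEM_SHARED_SECRET_BYTES];
  if (!MLKEM768_decap(recipient_secret, ciphertext, sizeof(ciphertext),
                      &priv)) {
    return 0;
  }
  return memcmp(sender_secret, recipient_secret, MLKEM_SHARED_SECRET_BYTES) ==
         0;
}

For long-term storage, the header recommends keeping only the 64-byte seed and rebuilding the key with MLKEM768_private_key_from_seed rather than serialising the verbose NIST private-key format.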
diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_span.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_span.h index aa9396a7..90ee7bb1 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_span.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_span.h @@ -30,6 +30,28 @@ extern "C++" { #include #endif +#if defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 201911L +#include +BSSL_NAMESPACE_BEGIN +template +class Span; +BSSL_NAMESPACE_END + +// Mark `Span` as satisfying the `view` and `borrowed_range` concepts. This +// should be done before the definition of `Span`, so that any inlined calls to +// range functionality use the correct specializations. +template +inline constexpr bool std::ranges::enable_view> = true; +template +inline constexpr bool std::ranges::enable_borrowed_range> = true; +#endif + BSSL_NAMESPACE_BEGIN template @@ -49,6 +71,16 @@ class SpanBase { friend bool operator!=(Span lhs, Span rhs) { return !(lhs == rhs); } }; + +// Heuristically test whether C is a container type that can be converted into +// a Span by checking for data() and size() member functions. +// +// TODO(davidben): Require C++17 support for std::is_convertible_v, etc. +template +using EnableIfContainer = std::enable_if_t< + std::is_convertible().data()), T *>::value && + std::is_integral().size())>::value>; + } // namespace internal // A Span is a non-owning reference to a contiguous array of objects of type @@ -84,16 +116,6 @@ class SpanBase { // a reference or pointer to a container or array. template class Span : private internal::SpanBase { - private: - // Heuristically test whether C is a container type that can be converted into - // a Span by checking for data() and size() member functions. - // - // TODO(davidben): Require C++17 support for std::is_convertible_v, etc. - template - using EnableIfContainer = std::enable_if_t< - std::is_convertible().data()), T *>::value && - std::is_integral().size())>::value>; - public: static const size_t npos = static_cast(-1); @@ -114,12 +136,12 @@ class Span : private internal::SpanBase { template constexpr Span(T (&array)[N]) : Span(array, N) {} - template , + template , typename = std::enable_if_t::value, C>> constexpr Span(const C &container) : data_(container.data()), size_(container.size()) {} - template , + template , typename = std::enable_if_t::value, C>> constexpr explicit Span(C &container) : data_(container.data()), size_(container.size()) {} @@ -188,6 +210,20 @@ class Span : private internal::SpanBase { template const size_t Span::npos; +#if __cplusplus >= 201703L +template +Span(T *, size_t) -> Span; +template +Span(T (&array)[size]) -> Span; +template < + typename C, + typename T = std::remove_pointer_t().data())>, + typename = internal::EnableIfContainer> +Span(C &) -> Span; +#endif + +// C++17 callers can instead rely on CTAD and the deduction guides defined +// above. template constexpr Span MakeSpan(T *ptr, size_t size) { return Span(ptr, size); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_stack.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_stack.h index 879be51b..be689841 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_stack.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_stack.h @@ -139,7 +139,8 @@ STACK_OF(SAMPLE) *sk_SAMPLE_new(sk_SAMPLE_cmp_func comp); STACK_OF(SAMPLE) *sk_SAMPLE_new_null(void); // sk_SAMPLE_num returns the number of elements in |sk|. 
It is safe to cast this -// value to |int|. |sk| is guaranteed to have at most |INT_MAX| elements. +// value to |int|. |sk| is guaranteed to have at most |INT_MAX| elements. If +// |sk| is NULL, it is treated as the empty list and this function returns zero. size_t sk_SAMPLE_num(const STACK_OF(SAMPLE) *sk); // sk_SAMPLE_zero resets |sk| to the empty state but does nothing to free the @@ -147,7 +148,8 @@ size_t sk_SAMPLE_num(const STACK_OF(SAMPLE) *sk); void sk_SAMPLE_zero(STACK_OF(SAMPLE) *sk); // sk_SAMPLE_value returns the |i|th pointer in |sk|, or NULL if |i| is out of -// range. +// range. If |sk| is NULL, it is treated as an empty list and the function +// returns NULL. SAMPLE *sk_SAMPLE_value(const STACK_OF(SAMPLE) *sk, size_t i); // sk_SAMPLE_set sets the |i|th pointer in |sk| to |p| and returns |p|. If |i| @@ -195,7 +197,8 @@ void sk_SAMPLE_delete_if(STACK_OF(SAMPLE) *sk, sk_SAMPLE_delete_if_func func, // If the stack is sorted (see |sk_SAMPLE_sort|), this function uses a binary // search. Otherwise it performs a linear search. If it finds a matching // element, it writes the index to |*out_index| (if |out_index| is not NULL) and -// returns one. Otherwise, it returns zero. +// returns one. Otherwise, it returns zero. If |sk| is NULL, it is treated as +// the empty list and the function returns zero. // // Note this differs from OpenSSL. The type signature is slightly different, and // OpenSSL's version will implicitly sort |sk| if it has a comparison function @@ -399,6 +402,9 @@ BSSL_NAMESPACE_END * positive warning. */ \ OPENSSL_MSVC_PRAGMA(warning(push)) \ OPENSSL_MSVC_PRAGMA(warning(disable : 4191)) \ + OPENSSL_CLANG_PRAGMA("clang diagnostic push") \ + OPENSSL_CLANG_PRAGMA("clang diagnostic ignored \"-Wunknown-warning-option\"") \ + OPENSSL_CLANG_PRAGMA("clang diagnostic ignored \"-Wcast-function-type-strict\"") \ \ DECLARE_STACK_OF(name) \ \ @@ -534,6 +540,7 @@ BSSL_NAMESPACE_END (OPENSSL_sk_free_func)free_func); \ } \ \ + OPENSSL_CLANG_PRAGMA("clang diagnostic pop") \ OPENSSL_MSVC_PRAGMA(warning(pop)) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_target.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_target.h index 29b1dc61..2760f52c 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_target.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_target.h @@ -84,18 +84,18 @@ // Trusty and Android baremetal aren't Linux but currently define __linux__. // As a workaround, we exclude them here. -// We also exclude nanolibc/CrOS EC/Zephyr. nanolibc/CrOS EC/Zephyr -// sometimes build for a non-Linux target (which should not define __linux__), -// but also sometimes build for Linux. Although technically running in Linux -// userspace, this lacks all the libc APIs we'd normally expect on Linux, so we -// treat it as a non-Linux target. +// We also exclude nanolibc/CrOS EC. nanolibc/CrOS EC sometimes build for a +// non-Linux target (which should not define __linux__), but also sometimes +// build for Linux. Although technically running in Linux userspace, this lacks +// all the libc APIs we'd normally expect on Linux, so we treat it as a +// non-Linux target. // // TODO(b/169780122): Remove this workaround once Trusty no longer defines it. // TODO(b/291101350): Remove this workaround once Android baremetal no longer // defines it. 
#if defined(__linux__) && !defined(__TRUSTY__) && \ !defined(ANDROID_BAREMETAL) && !defined(OPENSSL_NANOLIBC) && \ - !defined(CROS_EC) && !defined(CROS_ZEPHYR) + !defined(CROS_EC) #define OPENSSL_LINUX #endif @@ -148,16 +148,19 @@ #define OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED #endif -// CROS_ZEPHYR is an embedded target for ChromeOS Zephyr Embedded Controller. +// Zephyr is an open source RTOS, optimized for embedded devices. // Defining this on any other platform is not supported. Other embedded // platforms must introduce their own defines. // -// https://chromium.googlesource.com/chromiumos/platform/ec/+/HEAD/docs/zephyr/README.md -#if defined(CROS_ZEPHYR) +// Zephyr supports multithreading with cooperative and preemptive scheduling. +// It also implements POSIX Threads (pthread) API, so it's not necessary to +// implement BoringSSL internal threading API using some custom API. +// +// https://www.zephyrproject.org/ +#if defined(__ZEPHYR__) #define OPENSSL_NO_FILESYSTEM #define OPENSSL_NO_POSIX_IO #define OPENSSL_NO_SOCK -#define OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED #endif #if defined(__ANDROID_API__) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_x509.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_x509.h index 202a42af..fe832502 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_x509.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_x509.h @@ -254,9 +254,9 @@ OPENSSL_EXPORT void X509_get0_uids(const X509 *x509, // should not be accepted. #define EXFLAG_CRITICAL 0x200 // EXFLAG_SS indicates the certificate is likely self-signed. That is, if it is -// self-issued, its authority key identifer (if any) matches itself, and its key -// usage extension (if any) allows certificate signatures. The signature itself -// is not checked in computing this bit. +// self-issued, its authority key identifier (if any) matches itself, and its +// key usage extension (if any) allows certificate signatures. The signature +// itself is not checked in computing this bit. #define EXFLAG_SS 0x2000 // X509_get_extension_flags decodes a set of extensions from |x509| and returns @@ -2696,8 +2696,18 @@ OPENSSL_EXPORT void X509_ALGOR_get0(const ASN1_OBJECT **out_obj, // X509_ALGOR_set_md sets |alg| to the hash function |md|. Note this // AlgorithmIdentifier represents the hash function itself, not a signature -// algorithm that uses |md|. -OPENSSL_EXPORT void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md); +// algorithm that uses |md|. It returns one on success and zero on error. +// +// Due to historical specification mistakes (see Section 2.1 of RFC 4055), the +// parameters field is sometimes omitted and sometimes a NULL value. When used +// in RSASSA-PSS and RSAES-OAEP, it should be a NULL value. In other contexts, +// the parameters should be omitted. This function assumes the caller is +// constructing a RSASSA-PSS or RSAES-OAEP AlgorithmIdentifier and includes a +// NULL parameter. This differs from OpenSSL's behavior. +// +// TODO(davidben): Rename this function, or perhaps just add a bespoke API for +// constructing PSS and move on. +OPENSSL_EXPORT int X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md); // X509_ALGOR_cmp returns zero if |a| and |b| are equal, and some non-zero value // otherwise. 
Note this function can only be used for equality checks, not an @@ -3022,6 +3032,9 @@ OPENSSL_EXPORT int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, // |X509_STORE_CTX_get1_chain| may be used to return the verified certificate // chain. On error, |X509_STORE_CTX_get_error| may be used to return additional // error information. +// +// WARNING: Most failure conditions from this function do not use the error +// queue. Use |X509_STORE_CTX_get_error| to determine the cause of the error. OPENSSL_EXPORT int X509_verify_cert(X509_STORE_CTX *ctx); // X509_STORE_CTX_get0_chain, after a successful |X509_verify_cert| call, @@ -5281,29 +5294,6 @@ OPENSSL_EXPORT void X509_STORE_set_verify_cb( #define X509_STORE_set_verify_cb_func(store, func) \ X509_STORE_set_verify_cb((store), (func)) -typedef int (*X509_STORE_CTX_get_crl_fn)(X509_STORE_CTX *ctx, X509_CRL **crl, - X509 *x); -typedef int (*X509_STORE_CTX_check_crl_fn)(X509_STORE_CTX *ctx, X509_CRL *crl); - -// X509_STORE_set_get_crl override's |store|'s logic for looking up CRLs. -// -// Do not use this function. It is temporarily retained to support one caller -// and will be removed after that caller is fixed. It is not possible for -// external callers to correctly implement this callback. The real -// implementation sets some inaccessible internal state on |X509_STORE_CTX|. -OPENSSL_EXPORT void X509_STORE_set_get_crl(X509_STORE *store, - X509_STORE_CTX_get_crl_fn get_crl); - -// X509_STORE_set_check_crl override's |store|'s logic for checking CRL -// validity. -// -// Do not use this function. It is temporarily retained to support one caller -// and will be removed after that caller is fixed. It is not possible for -// external callers to correctly implement this callback. The real -// implementation relies some inaccessible internal state on |X509_STORE_CTX|. -OPENSSL_EXPORT void X509_STORE_set_check_crl( - X509_STORE *store, X509_STORE_CTX_check_crl_fn check_crl); - // X509_STORE_CTX_set_chain configures |ctx| to use |sk| for untrusted // intermediate certificates to use in verification. This function is redundant // with the |chain| parameter of |X509_STORE_CTX_init|. 
Use the parameter diff --git a/Sources/CCryptoBoringSSL/include/boringssl_prefix_symbols_nasm.inc b/Sources/CCryptoBoringSSL/include/boringssl_prefix_symbols_nasm.inc index 2f360bd2..517fba6f 100644 --- a/Sources/CCryptoBoringSSL/include/boringssl_prefix_symbols_nasm.inc +++ b/Sources/CCryptoBoringSSL/include/boringssl_prefix_symbols_nasm.inc @@ -207,6 +207,14 @@ %xdefine _BASIC_CONSTRAINTS_free _ %+ BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_free %xdefine _BASIC_CONSTRAINTS_it _ %+ BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_it %xdefine _BASIC_CONSTRAINTS_new _ %+ BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_new +%xdefine _BCM_fips_186_2_prf _ %+ BORINGSSL_PREFIX %+ _BCM_fips_186_2_prf +%xdefine _BCM_rand_bytes _ %+ BORINGSSL_PREFIX %+ _BCM_rand_bytes +%xdefine _BCM_rand_bytes_hwrng _ %+ BORINGSSL_PREFIX %+ _BCM_rand_bytes_hwrng +%xdefine _BCM_rand_bytes_with_additional_data _ %+ BORINGSSL_PREFIX %+ _BCM_rand_bytes_with_additional_data +%xdefine _BCM_sha1_final _ %+ BORINGSSL_PREFIX %+ _BCM_sha1_final +%xdefine _BCM_sha1_init _ %+ BORINGSSL_PREFIX %+ _BCM_sha1_init +%xdefine _BCM_sha1_transform _ %+ BORINGSSL_PREFIX %+ _BCM_sha1_transform +%xdefine _BCM_sha1_update _ %+ BORINGSSL_PREFIX %+ _BCM_sha1_update %xdefine _BIO_append_filename _ %+ BORINGSSL_PREFIX %+ _BIO_append_filename %xdefine _BIO_callback_ctrl _ %+ BORINGSSL_PREFIX %+ _BIO_callback_ctrl %xdefine _BIO_clear_flags _ %+ BORINGSSL_PREFIX %+ _BIO_clear_flags @@ -713,6 +721,16 @@ %xdefine _DH_size _ %+ BORINGSSL_PREFIX %+ _DH_size %xdefine _DH_up_ref _ %+ BORINGSSL_PREFIX %+ _DH_up_ref %xdefine _DHparams_dup _ %+ BORINGSSL_PREFIX %+ _DHparams_dup +%xdefine _DILITHIUM_generate_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_generate_key +%xdefine _DILITHIUM_generate_key_external_entropy _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_generate_key_external_entropy +%xdefine _DILITHIUM_marshal_private_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_marshal_private_key +%xdefine _DILITHIUM_marshal_public_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_marshal_public_key +%xdefine _DILITHIUM_parse_private_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_parse_private_key +%xdefine _DILITHIUM_parse_public_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_parse_public_key +%xdefine _DILITHIUM_public_from_private _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_public_from_private +%xdefine _DILITHIUM_sign _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_sign +%xdefine _DILITHIUM_sign_deterministic _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_sign_deterministic +%xdefine _DILITHIUM_verify _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_verify %xdefine _DIRECTORYSTRING_free _ %+ BORINGSSL_PREFIX %+ _DIRECTORYSTRING_free %xdefine _DIRECTORYSTRING_it _ %+ BORINGSSL_PREFIX %+ _DIRECTORYSTRING_it %xdefine _DIRECTORYSTRING_new _ %+ BORINGSSL_PREFIX %+ _DIRECTORYSTRING_new @@ -1094,6 +1112,7 @@ %xdefine _EVP_PKEY_CTX_set0_rsa_oaep_label _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set0_rsa_oaep_label %xdefine _EVP_PKEY_CTX_set1_hkdf_key _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set1_hkdf_key %xdefine _EVP_PKEY_CTX_set1_hkdf_salt _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set1_hkdf_salt +%xdefine _EVP_PKEY_CTX_set_dh_pad _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dh_pad %xdefine _EVP_PKEY_CTX_set_dsa_paramgen_bits _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dsa_paramgen_bits %xdefine _EVP_PKEY_CTX_set_dsa_paramgen_q_bits _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dsa_paramgen_q_bits %xdefine _EVP_PKEY_CTX_set_ec_param_enc _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_ec_param_enc @@ -1110,6 +1129,7 @@ %xdefine _EVP_PKEY_CTX_set_rsa_pss_saltlen _ %+ BORINGSSL_PREFIX %+ 
_EVP_PKEY_CTX_set_rsa_pss_saltlen %xdefine _EVP_PKEY_CTX_set_signature_md _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_signature_md %xdefine _EVP_PKEY_assign _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign +%xdefine _EVP_PKEY_assign_DH _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign_DH %xdefine _EVP_PKEY_assign_DSA _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign_DSA %xdefine _EVP_PKEY_assign_EC_KEY _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign_EC_KEY %xdefine _EVP_PKEY_assign_RSA _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign_RSA @@ -1151,6 +1171,7 @@ %xdefine _EVP_PKEY_print_params _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_print_params %xdefine _EVP_PKEY_print_private _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_print_private %xdefine _EVP_PKEY_print_public _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_print_public +%xdefine _EVP_PKEY_set1_DH _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_set1_DH %xdefine _EVP_PKEY_set1_DSA _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_set1_DSA %xdefine _EVP_PKEY_set1_EC_KEY _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_set1_EC_KEY %xdefine _EVP_PKEY_set1_RSA _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_set1_RSA @@ -1235,6 +1256,7 @@ %xdefine _EVP_hpke_aes_256_gcm _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_aes_256_gcm %xdefine _EVP_hpke_chacha20_poly1305 _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_chacha20_poly1305 %xdefine _EVP_hpke_hkdf_sha256 _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_hkdf_sha256 +%xdefine _EVP_hpke_p256_hkdf_sha256 _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_p256_hkdf_sha256 %xdefine _EVP_hpke_x25519_hkdf_sha256 _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_x25519_hkdf_sha256 %xdefine _EVP_marshal_digest_algorithm _ %+ BORINGSSL_PREFIX %+ _EVP_marshal_digest_algorithm %xdefine _EVP_marshal_private_key _ %+ BORINGSSL_PREFIX %+ _EVP_marshal_private_key @@ -1337,6 +1359,18 @@ %xdefine _MD5_Update _ %+ BORINGSSL_PREFIX %+ _MD5_Update %xdefine _METHOD_ref _ %+ BORINGSSL_PREFIX %+ _METHOD_ref %xdefine _METHOD_unref _ %+ BORINGSSL_PREFIX %+ _METHOD_unref +%xdefine _MLDSA65_generate_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_generate_key +%xdefine _MLDSA65_generate_key_external_entropy _ %+ BORINGSSL_PREFIX %+ _MLDSA65_generate_key_external_entropy +%xdefine _MLDSA65_marshal_private_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_marshal_private_key +%xdefine _MLDSA65_marshal_public_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_marshal_public_key +%xdefine _MLDSA65_parse_private_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_parse_private_key +%xdefine _MLDSA65_parse_public_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_parse_public_key +%xdefine _MLDSA65_private_key_from_seed _ %+ BORINGSSL_PREFIX %+ _MLDSA65_private_key_from_seed +%xdefine _MLDSA65_public_from_private _ %+ BORINGSSL_PREFIX %+ _MLDSA65_public_from_private +%xdefine _MLDSA65_sign _ %+ BORINGSSL_PREFIX %+ _MLDSA65_sign +%xdefine _MLDSA65_sign_internal _ %+ BORINGSSL_PREFIX %+ _MLDSA65_sign_internal +%xdefine _MLDSA65_verify _ %+ BORINGSSL_PREFIX %+ _MLDSA65_verify +%xdefine _MLDSA65_verify_internal _ %+ BORINGSSL_PREFIX %+ _MLDSA65_verify_internal %xdefine _NAME_CONSTRAINTS_check _ %+ BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_check %xdefine _NAME_CONSTRAINTS_free _ %+ BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_free %xdefine _NAME_CONSTRAINTS_it _ %+ BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_it @@ -1362,6 +1396,8 @@ %xdefine _NOTICEREF_free _ %+ BORINGSSL_PREFIX %+ _NOTICEREF_free %xdefine _NOTICEREF_it _ %+ BORINGSSL_PREFIX %+ _NOTICEREF_it %xdefine _NOTICEREF_new _ %+ BORINGSSL_PREFIX %+ _NOTICEREF_new +%xdefine _OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER _ %+ BORINGSSL_PREFIX %+ 
_OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER +%xdefine _OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER _ %+ BORINGSSL_PREFIX %+ _OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER %xdefine _OBJ_cbs2nid _ %+ BORINGSSL_PREFIX %+ _OBJ_cbs2nid %xdefine _OBJ_cleanup _ %+ BORINGSSL_PREFIX %+ _OBJ_cleanup %xdefine _OBJ_cmp _ %+ BORINGSSL_PREFIX %+ _OBJ_cmp @@ -1401,6 +1437,7 @@ %xdefine _OPENSSL_gmtime_diff _ %+ BORINGSSL_PREFIX %+ _OPENSSL_gmtime_diff %xdefine _OPENSSL_hash32 _ %+ BORINGSSL_PREFIX %+ _OPENSSL_hash32 %xdefine _OPENSSL_ia32cap_P _ %+ BORINGSSL_PREFIX %+ _OPENSSL_ia32cap_P +%xdefine _OPENSSL_init_cpuid _ %+ BORINGSSL_PREFIX %+ _OPENSSL_init_cpuid %xdefine _OPENSSL_init_crypto _ %+ BORINGSSL_PREFIX %+ _OPENSSL_init_crypto %xdefine _OPENSSL_isalnum _ %+ BORINGSSL_PREFIX %+ _OPENSSL_isalnum %xdefine _OPENSSL_isalpha _ %+ BORINGSSL_PREFIX %+ _OPENSSL_isalpha @@ -1610,7 +1647,6 @@ %xdefine _RAND_SSLeay _ %+ BORINGSSL_PREFIX %+ _RAND_SSLeay %xdefine _RAND_add _ %+ BORINGSSL_PREFIX %+ _RAND_add %xdefine _RAND_bytes _ %+ BORINGSSL_PREFIX %+ _RAND_bytes -%xdefine _RAND_bytes_with_additional_data _ %+ BORINGSSL_PREFIX %+ _RAND_bytes_with_additional_data %xdefine _RAND_cleanup _ %+ BORINGSSL_PREFIX %+ _RAND_cleanup %xdefine _RAND_disable_fork_unsafe_buffering _ %+ BORINGSSL_PREFIX %+ _RAND_disable_fork_unsafe_buffering %xdefine _RAND_egd _ %+ BORINGSSL_PREFIX %+ _RAND_egd @@ -1734,6 +1770,10 @@ %xdefine _SPAKE2_CTX_new _ %+ BORINGSSL_PREFIX %+ _SPAKE2_CTX_new %xdefine _SPAKE2_generate_msg _ %+ BORINGSSL_PREFIX %+ _SPAKE2_generate_msg %xdefine _SPAKE2_process_msg _ %+ BORINGSSL_PREFIX %+ _SPAKE2_process_msg +%xdefine _SPX_generate_key _ %+ BORINGSSL_PREFIX %+ _SPX_generate_key +%xdefine _SPX_generate_key_from_seed _ %+ BORINGSSL_PREFIX %+ _SPX_generate_key_from_seed +%xdefine _SPX_sign _ %+ BORINGSSL_PREFIX %+ _SPX_sign +%xdefine _SPX_verify _ %+ BORINGSSL_PREFIX %+ _SPX_verify %xdefine _SSLeay _ %+ BORINGSSL_PREFIX %+ _SSLeay %xdefine _SSLeay_version _ %+ BORINGSSL_PREFIX %+ _SSLeay_version %xdefine _TRUST_TOKEN_CLIENT_add_key _ %+ BORINGSSL_PREFIX %+ _TRUST_TOKEN_CLIENT_add_key @@ -2062,11 +2102,9 @@ %xdefine _X509_STORE_load_locations _ %+ BORINGSSL_PREFIX %+ _X509_STORE_load_locations %xdefine _X509_STORE_new _ %+ BORINGSSL_PREFIX %+ _X509_STORE_new %xdefine _X509_STORE_set1_param _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set1_param -%xdefine _X509_STORE_set_check_crl _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_check_crl %xdefine _X509_STORE_set_default_paths _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_default_paths %xdefine _X509_STORE_set_depth _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_depth %xdefine _X509_STORE_set_flags _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_flags -%xdefine _X509_STORE_set_get_crl _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_get_crl %xdefine _X509_STORE_set_purpose _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_purpose %xdefine _X509_STORE_set_trust _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_trust %xdefine _X509_STORE_set_verify_cb _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_verify_cb @@ -2246,8 +2284,11 @@ %xdefine _aes_hw_decrypt _ %+ BORINGSSL_PREFIX %+ _aes_hw_decrypt %xdefine _aes_hw_ecb_encrypt _ %+ BORINGSSL_PREFIX %+ _aes_hw_ecb_encrypt %xdefine _aes_hw_encrypt _ %+ BORINGSSL_PREFIX %+ _aes_hw_encrypt +%xdefine _aes_hw_encrypt_key_to_decrypt_key _ %+ BORINGSSL_PREFIX %+ _aes_hw_encrypt_key_to_decrypt_key %xdefine _aes_hw_set_decrypt_key _ %+ BORINGSSL_PREFIX %+ _aes_hw_set_decrypt_key %xdefine 
_aes_hw_set_encrypt_key _ %+ BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key +%xdefine _aes_hw_set_encrypt_key_alt _ %+ BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key_alt +%xdefine _aes_hw_set_encrypt_key_base _ %+ BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key_base %xdefine _aes_nohw_cbc_encrypt _ %+ BORINGSSL_PREFIX %+ _aes_nohw_cbc_encrypt %xdefine _aes_nohw_ctr32_encrypt_blocks _ %+ BORINGSSL_PREFIX %+ _aes_nohw_ctr32_encrypt_blocks %xdefine _aes_nohw_decrypt _ %+ BORINGSSL_PREFIX %+ _aes_nohw_decrypt @@ -2319,19 +2360,22 @@ %xdefine _bn_mont_ctx_set_RR_consttime _ %+ BORINGSSL_PREFIX %+ _bn_mont_ctx_set_RR_consttime %xdefine _bn_mont_n0 _ %+ BORINGSSL_PREFIX %+ _bn_mont_n0 %xdefine _bn_mul4x_mont _ %+ BORINGSSL_PREFIX %+ _bn_mul4x_mont +%xdefine _bn_mul4x_mont_gather5 _ %+ BORINGSSL_PREFIX %+ _bn_mul4x_mont_gather5 %xdefine _bn_mul_add_words _ %+ BORINGSSL_PREFIX %+ _bn_mul_add_words %xdefine _bn_mul_comba4 _ %+ BORINGSSL_PREFIX %+ _bn_mul_comba4 %xdefine _bn_mul_comba8 _ %+ BORINGSSL_PREFIX %+ _bn_mul_comba8 %xdefine _bn_mul_consttime _ %+ BORINGSSL_PREFIX %+ _bn_mul_consttime %xdefine _bn_mul_mont _ %+ BORINGSSL_PREFIX %+ _bn_mul_mont -%xdefine _bn_mul_mont_gather5 _ %+ BORINGSSL_PREFIX %+ _bn_mul_mont_gather5 +%xdefine _bn_mul_mont_gather5_nohw _ %+ BORINGSSL_PREFIX %+ _bn_mul_mont_gather5_nohw %xdefine _bn_mul_mont_nohw _ %+ BORINGSSL_PREFIX %+ _bn_mul_mont_nohw %xdefine _bn_mul_small _ %+ BORINGSSL_PREFIX %+ _bn_mul_small %xdefine _bn_mul_words _ %+ BORINGSSL_PREFIX %+ _bn_mul_words %xdefine _bn_mulx4x_mont _ %+ BORINGSSL_PREFIX %+ _bn_mulx4x_mont +%xdefine _bn_mulx4x_mont_gather5 _ %+ BORINGSSL_PREFIX %+ _bn_mulx4x_mont_gather5 %xdefine _bn_odd_number_is_obviously_composite _ %+ BORINGSSL_PREFIX %+ _bn_odd_number_is_obviously_composite %xdefine _bn_one_to_montgomery _ %+ BORINGSSL_PREFIX %+ _bn_one_to_montgomery -%xdefine _bn_power5 _ %+ BORINGSSL_PREFIX %+ _bn_power5 +%xdefine _bn_power5_nohw _ %+ BORINGSSL_PREFIX %+ _bn_power5_nohw +%xdefine _bn_powerx5 _ %+ BORINGSSL_PREFIX %+ _bn_powerx5 %xdefine _bn_rand_range_words _ %+ BORINGSSL_PREFIX %+ _bn_rand_range_words %xdefine _bn_rand_secret_range _ %+ BORINGSSL_PREFIX %+ _bn_rand_secret_range %xdefine _bn_reduce_once _ %+ BORINGSSL_PREFIX %+ _bn_reduce_once @@ -2366,7 +2410,11 @@ %xdefine _c2i_ASN1_INTEGER _ %+ BORINGSSL_PREFIX %+ _c2i_ASN1_INTEGER %xdefine _c2i_ASN1_OBJECT _ %+ BORINGSSL_PREFIX %+ _c2i_ASN1_OBJECT %xdefine _chacha20_poly1305_open _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_open +%xdefine _chacha20_poly1305_open_avx2 _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_open_avx2 +%xdefine _chacha20_poly1305_open_nohw _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_open_nohw %xdefine _chacha20_poly1305_seal _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_seal +%xdefine _chacha20_poly1305_seal_avx2 _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_seal_avx2 +%xdefine _chacha20_poly1305_seal_nohw _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_seal_nohw %xdefine _crypto_gcm_clmul_enabled _ %+ BORINGSSL_PREFIX %+ _crypto_gcm_clmul_enabled %xdefine _d2i_ASN1_BIT_STRING _ %+ BORINGSSL_PREFIX %+ _d2i_ASN1_BIT_STRING %xdefine _d2i_ASN1_BMPSTRING _ %+ BORINGSSL_PREFIX %+ _d2i_ASN1_BMPSTRING @@ -2475,8 +2523,10 @@ %xdefine _d2i_X509_VAL _ %+ BORINGSSL_PREFIX %+ _d2i_X509_VAL %xdefine _d2i_X509_bio _ %+ BORINGSSL_PREFIX %+ _d2i_X509_bio %xdefine _d2i_X509_fp _ %+ BORINGSSL_PREFIX %+ _d2i_X509_fp +%xdefine _dh_asn1_meth _ %+ BORINGSSL_PREFIX %+ _dh_asn1_meth %xdefine _dh_check_params_fast _ %+ BORINGSSL_PREFIX %+ _dh_check_params_fast %xdefine 
_dh_compute_key_padded_no_self_test _ %+ BORINGSSL_PREFIX %+ _dh_compute_key_padded_no_self_test +%xdefine _dh_pkey_meth _ %+ BORINGSSL_PREFIX %+ _dh_pkey_meth %xdefine _dsa_asn1_meth _ %+ BORINGSSL_PREFIX %+ _dsa_asn1_meth %xdefine _dsa_check_key _ %+ BORINGSSL_PREFIX %+ _dsa_check_key %xdefine _ec_GFp_mont_add _ %+ BORINGSSL_PREFIX %+ _ec_GFp_mont_add @@ -2566,25 +2616,46 @@ %xdefine _ec_set_to_safe_point _ %+ BORINGSSL_PREFIX %+ _ec_set_to_safe_point %xdefine _ec_simple_scalar_inv0_montgomery _ %+ BORINGSSL_PREFIX %+ _ec_simple_scalar_inv0_montgomery %xdefine _ec_simple_scalar_to_montgomery_inv_vartime _ %+ BORINGSSL_PREFIX %+ _ec_simple_scalar_to_montgomery_inv_vartime -%xdefine _ecdsa_do_verify_no_self_test _ %+ BORINGSSL_PREFIX %+ _ecdsa_do_verify_no_self_test -%xdefine _ecdsa_sign_with_nonce_for_known_answer_test _ %+ BORINGSSL_PREFIX %+ _ecdsa_sign_with_nonce_for_known_answer_test -%xdefine _ecp_nistz256_avx2_select_w7 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_avx2_select_w7 +%xdefine _ecdsa_sign_fixed _ %+ BORINGSSL_PREFIX %+ _ecdsa_sign_fixed +%xdefine _ecdsa_sign_fixed_with_nonce_for_known_answer_test _ %+ BORINGSSL_PREFIX %+ _ecdsa_sign_fixed_with_nonce_for_known_answer_test +%xdefine _ecdsa_verify_fixed _ %+ BORINGSSL_PREFIX %+ _ecdsa_verify_fixed +%xdefine _ecdsa_verify_fixed_no_self_test _ %+ BORINGSSL_PREFIX %+ _ecdsa_verify_fixed_no_self_test %xdefine _ecp_nistz256_div_by_2 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_div_by_2 %xdefine _ecp_nistz256_mul_by_2 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_by_2 %xdefine _ecp_nistz256_mul_by_3 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_by_3 %xdefine _ecp_nistz256_mul_mont _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont +%xdefine _ecp_nistz256_mul_mont_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont_adx +%xdefine _ecp_nistz256_mul_mont_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont_nohw %xdefine _ecp_nistz256_neg _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_neg %xdefine _ecp_nistz256_ord_mul_mont _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont +%xdefine _ecp_nistz256_ord_mul_mont_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont_adx +%xdefine _ecp_nistz256_ord_mul_mont_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont_nohw %xdefine _ecp_nistz256_ord_sqr_mont _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont +%xdefine _ecp_nistz256_ord_sqr_mont_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont_adx +%xdefine _ecp_nistz256_ord_sqr_mont_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont_nohw %xdefine _ecp_nistz256_point_add _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add +%xdefine _ecp_nistz256_point_add_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_adx %xdefine _ecp_nistz256_point_add_affine _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine +%xdefine _ecp_nistz256_point_add_affine_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine_adx +%xdefine _ecp_nistz256_point_add_affine_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine_nohw +%xdefine _ecp_nistz256_point_add_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_nohw %xdefine _ecp_nistz256_point_double _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_double +%xdefine _ecp_nistz256_point_double_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_double_adx +%xdefine _ecp_nistz256_point_double_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_double_nohw %xdefine _ecp_nistz256_select_w5 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5 +%xdefine _ecp_nistz256_select_w5_avx2 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5_avx2 
+%xdefine _ecp_nistz256_select_w5_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5_nohw %xdefine _ecp_nistz256_select_w7 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7 +%xdefine _ecp_nistz256_select_w7_avx2 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7_avx2 +%xdefine _ecp_nistz256_select_w7_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7_nohw %xdefine _ecp_nistz256_sqr_mont _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont +%xdefine _ecp_nistz256_sqr_mont_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont_adx +%xdefine _ecp_nistz256_sqr_mont_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont_nohw %xdefine _ecp_nistz256_sub _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_sub %xdefine _ed25519_asn1_meth _ %+ BORINGSSL_PREFIX %+ _ed25519_asn1_meth %xdefine _ed25519_pkey_meth _ %+ BORINGSSL_PREFIX %+ _ed25519_pkey_meth +%xdefine _evp_md_md5_sha1 _ %+ BORINGSSL_PREFIX %+ _evp_md_md5_sha1 +%xdefine _evp_pkey_set_method _ %+ BORINGSSL_PREFIX %+ _evp_pkey_set_method %xdefine _fiat_curve25519_adx_mul _ %+ BORINGSSL_PREFIX %+ _fiat_curve25519_adx_mul %xdefine _fiat_curve25519_adx_square _ %+ BORINGSSL_PREFIX %+ _fiat_curve25519_adx_square %xdefine _fiat_p256_adx_mul _ %+ BORINGSSL_PREFIX %+ _fiat_p256_adx_mul @@ -2826,8 +2897,6 @@ %xdefine _spx_fors_sign _ %+ BORINGSSL_PREFIX %+ _spx_fors_sign %xdefine _spx_fors_sk_gen _ %+ BORINGSSL_PREFIX %+ _spx_fors_sk_gen %xdefine _spx_fors_treehash _ %+ BORINGSSL_PREFIX %+ _spx_fors_treehash -%xdefine _spx_generate_key _ %+ BORINGSSL_PREFIX %+ _spx_generate_key -%xdefine _spx_generate_key_from_seed _ %+ BORINGSSL_PREFIX %+ _spx_generate_key_from_seed %xdefine _spx_get_tree_index _ %+ BORINGSSL_PREFIX %+ _spx_get_tree_index %xdefine _spx_ht_sign _ %+ BORINGSSL_PREFIX %+ _spx_ht_sign %xdefine _spx_ht_verify _ %+ BORINGSSL_PREFIX %+ _spx_ht_verify @@ -2839,7 +2908,6 @@ %xdefine _spx_set_tree_height _ %+ BORINGSSL_PREFIX %+ _spx_set_tree_height %xdefine _spx_set_tree_index _ %+ BORINGSSL_PREFIX %+ _spx_set_tree_index %xdefine _spx_set_type _ %+ BORINGSSL_PREFIX %+ _spx_set_type -%xdefine _spx_sign _ %+ BORINGSSL_PREFIX %+ _spx_sign %xdefine _spx_thash_f _ %+ BORINGSSL_PREFIX %+ _spx_thash_f %xdefine _spx_thash_h _ %+ BORINGSSL_PREFIX %+ _spx_thash_h %xdefine _spx_thash_hmsg _ %+ BORINGSSL_PREFIX %+ _spx_thash_hmsg @@ -2850,12 +2918,12 @@ %xdefine _spx_to_uint64 _ %+ BORINGSSL_PREFIX %+ _spx_to_uint64 %xdefine _spx_treehash _ %+ BORINGSSL_PREFIX %+ _spx_treehash %xdefine _spx_uint64_to_len_bytes _ %+ BORINGSSL_PREFIX %+ _spx_uint64_to_len_bytes -%xdefine _spx_verify _ %+ BORINGSSL_PREFIX %+ _spx_verify %xdefine _spx_wots_pk_from_sig _ %+ BORINGSSL_PREFIX %+ _spx_wots_pk_from_sig %xdefine _spx_wots_pk_gen _ %+ BORINGSSL_PREFIX %+ _spx_wots_pk_gen %xdefine _spx_wots_sign _ %+ BORINGSSL_PREFIX %+ _spx_wots_sign %xdefine _spx_xmss_pk_from_sig _ %+ BORINGSSL_PREFIX %+ _spx_xmss_pk_from_sig %xdefine _spx_xmss_sign _ %+ BORINGSSL_PREFIX %+ _spx_xmss_sign +%xdefine _swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE _ %+ BORINGSSL_PREFIX %+ _swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE %xdefine _v2i_GENERAL_NAME _ %+ BORINGSSL_PREFIX %+ _v2i_GENERAL_NAME %xdefine _v2i_GENERAL_NAMES _ %+ BORINGSSL_PREFIX %+ _v2i_GENERAL_NAMES %xdefine _v2i_GENERAL_NAME_ex _ %+ BORINGSSL_PREFIX %+ _v2i_GENERAL_NAME_ex @@ -3128,6 +3196,14 @@ %xdefine BASIC_CONSTRAINTS_free BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_free %xdefine BASIC_CONSTRAINTS_it BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_it %xdefine BASIC_CONSTRAINTS_new BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_new +%xdefine 
BCM_fips_186_2_prf BORINGSSL_PREFIX %+ _BCM_fips_186_2_prf +%xdefine BCM_rand_bytes BORINGSSL_PREFIX %+ _BCM_rand_bytes +%xdefine BCM_rand_bytes_hwrng BORINGSSL_PREFIX %+ _BCM_rand_bytes_hwrng +%xdefine BCM_rand_bytes_with_additional_data BORINGSSL_PREFIX %+ _BCM_rand_bytes_with_additional_data +%xdefine BCM_sha1_final BORINGSSL_PREFIX %+ _BCM_sha1_final +%xdefine BCM_sha1_init BORINGSSL_PREFIX %+ _BCM_sha1_init +%xdefine BCM_sha1_transform BORINGSSL_PREFIX %+ _BCM_sha1_transform +%xdefine BCM_sha1_update BORINGSSL_PREFIX %+ _BCM_sha1_update %xdefine BIO_append_filename BORINGSSL_PREFIX %+ _BIO_append_filename %xdefine BIO_callback_ctrl BORINGSSL_PREFIX %+ _BIO_callback_ctrl %xdefine BIO_clear_flags BORINGSSL_PREFIX %+ _BIO_clear_flags @@ -3634,6 +3710,16 @@ %xdefine DH_size BORINGSSL_PREFIX %+ _DH_size %xdefine DH_up_ref BORINGSSL_PREFIX %+ _DH_up_ref %xdefine DHparams_dup BORINGSSL_PREFIX %+ _DHparams_dup +%xdefine DILITHIUM_generate_key BORINGSSL_PREFIX %+ _DILITHIUM_generate_key +%xdefine DILITHIUM_generate_key_external_entropy BORINGSSL_PREFIX %+ _DILITHIUM_generate_key_external_entropy +%xdefine DILITHIUM_marshal_private_key BORINGSSL_PREFIX %+ _DILITHIUM_marshal_private_key +%xdefine DILITHIUM_marshal_public_key BORINGSSL_PREFIX %+ _DILITHIUM_marshal_public_key +%xdefine DILITHIUM_parse_private_key BORINGSSL_PREFIX %+ _DILITHIUM_parse_private_key +%xdefine DILITHIUM_parse_public_key BORINGSSL_PREFIX %+ _DILITHIUM_parse_public_key +%xdefine DILITHIUM_public_from_private BORINGSSL_PREFIX %+ _DILITHIUM_public_from_private +%xdefine DILITHIUM_sign BORINGSSL_PREFIX %+ _DILITHIUM_sign +%xdefine DILITHIUM_sign_deterministic BORINGSSL_PREFIX %+ _DILITHIUM_sign_deterministic +%xdefine DILITHIUM_verify BORINGSSL_PREFIX %+ _DILITHIUM_verify %xdefine DIRECTORYSTRING_free BORINGSSL_PREFIX %+ _DIRECTORYSTRING_free %xdefine DIRECTORYSTRING_it BORINGSSL_PREFIX %+ _DIRECTORYSTRING_it %xdefine DIRECTORYSTRING_new BORINGSSL_PREFIX %+ _DIRECTORYSTRING_new @@ -4015,6 +4101,7 @@ %xdefine EVP_PKEY_CTX_set0_rsa_oaep_label BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set0_rsa_oaep_label %xdefine EVP_PKEY_CTX_set1_hkdf_key BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set1_hkdf_key %xdefine EVP_PKEY_CTX_set1_hkdf_salt BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set1_hkdf_salt +%xdefine EVP_PKEY_CTX_set_dh_pad BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dh_pad %xdefine EVP_PKEY_CTX_set_dsa_paramgen_bits BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dsa_paramgen_bits %xdefine EVP_PKEY_CTX_set_dsa_paramgen_q_bits BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dsa_paramgen_q_bits %xdefine EVP_PKEY_CTX_set_ec_param_enc BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_ec_param_enc @@ -4031,6 +4118,7 @@ %xdefine EVP_PKEY_CTX_set_rsa_pss_saltlen BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_rsa_pss_saltlen %xdefine EVP_PKEY_CTX_set_signature_md BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_signature_md %xdefine EVP_PKEY_assign BORINGSSL_PREFIX %+ _EVP_PKEY_assign +%xdefine EVP_PKEY_assign_DH BORINGSSL_PREFIX %+ _EVP_PKEY_assign_DH %xdefine EVP_PKEY_assign_DSA BORINGSSL_PREFIX %+ _EVP_PKEY_assign_DSA %xdefine EVP_PKEY_assign_EC_KEY BORINGSSL_PREFIX %+ _EVP_PKEY_assign_EC_KEY %xdefine EVP_PKEY_assign_RSA BORINGSSL_PREFIX %+ _EVP_PKEY_assign_RSA @@ -4072,6 +4160,7 @@ %xdefine EVP_PKEY_print_params BORINGSSL_PREFIX %+ _EVP_PKEY_print_params %xdefine EVP_PKEY_print_private BORINGSSL_PREFIX %+ _EVP_PKEY_print_private %xdefine EVP_PKEY_print_public BORINGSSL_PREFIX %+ _EVP_PKEY_print_public +%xdefine EVP_PKEY_set1_DH BORINGSSL_PREFIX %+ _EVP_PKEY_set1_DH %xdefine EVP_PKEY_set1_DSA BORINGSSL_PREFIX %+ 
_EVP_PKEY_set1_DSA %xdefine EVP_PKEY_set1_EC_KEY BORINGSSL_PREFIX %+ _EVP_PKEY_set1_EC_KEY %xdefine EVP_PKEY_set1_RSA BORINGSSL_PREFIX %+ _EVP_PKEY_set1_RSA @@ -4156,6 +4245,7 @@ %xdefine EVP_hpke_aes_256_gcm BORINGSSL_PREFIX %+ _EVP_hpke_aes_256_gcm %xdefine EVP_hpke_chacha20_poly1305 BORINGSSL_PREFIX %+ _EVP_hpke_chacha20_poly1305 %xdefine EVP_hpke_hkdf_sha256 BORINGSSL_PREFIX %+ _EVP_hpke_hkdf_sha256 +%xdefine EVP_hpke_p256_hkdf_sha256 BORINGSSL_PREFIX %+ _EVP_hpke_p256_hkdf_sha256 %xdefine EVP_hpke_x25519_hkdf_sha256 BORINGSSL_PREFIX %+ _EVP_hpke_x25519_hkdf_sha256 %xdefine EVP_marshal_digest_algorithm BORINGSSL_PREFIX %+ _EVP_marshal_digest_algorithm %xdefine EVP_marshal_private_key BORINGSSL_PREFIX %+ _EVP_marshal_private_key @@ -4258,6 +4348,18 @@ %xdefine MD5_Update BORINGSSL_PREFIX %+ _MD5_Update %xdefine METHOD_ref BORINGSSL_PREFIX %+ _METHOD_ref %xdefine METHOD_unref BORINGSSL_PREFIX %+ _METHOD_unref +%xdefine MLDSA65_generate_key BORINGSSL_PREFIX %+ _MLDSA65_generate_key +%xdefine MLDSA65_generate_key_external_entropy BORINGSSL_PREFIX %+ _MLDSA65_generate_key_external_entropy +%xdefine MLDSA65_marshal_private_key BORINGSSL_PREFIX %+ _MLDSA65_marshal_private_key +%xdefine MLDSA65_marshal_public_key BORINGSSL_PREFIX %+ _MLDSA65_marshal_public_key +%xdefine MLDSA65_parse_private_key BORINGSSL_PREFIX %+ _MLDSA65_parse_private_key +%xdefine MLDSA65_parse_public_key BORINGSSL_PREFIX %+ _MLDSA65_parse_public_key +%xdefine MLDSA65_private_key_from_seed BORINGSSL_PREFIX %+ _MLDSA65_private_key_from_seed +%xdefine MLDSA65_public_from_private BORINGSSL_PREFIX %+ _MLDSA65_public_from_private +%xdefine MLDSA65_sign BORINGSSL_PREFIX %+ _MLDSA65_sign +%xdefine MLDSA65_sign_internal BORINGSSL_PREFIX %+ _MLDSA65_sign_internal +%xdefine MLDSA65_verify BORINGSSL_PREFIX %+ _MLDSA65_verify +%xdefine MLDSA65_verify_internal BORINGSSL_PREFIX %+ _MLDSA65_verify_internal %xdefine NAME_CONSTRAINTS_check BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_check %xdefine NAME_CONSTRAINTS_free BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_free %xdefine NAME_CONSTRAINTS_it BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_it @@ -4283,6 +4385,8 @@ %xdefine NOTICEREF_free BORINGSSL_PREFIX %+ _NOTICEREF_free %xdefine NOTICEREF_it BORINGSSL_PREFIX %+ _NOTICEREF_it %xdefine NOTICEREF_new BORINGSSL_PREFIX %+ _NOTICEREF_new +%xdefine OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_PREFIX %+ _OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER +%xdefine OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_PREFIX %+ _OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER %xdefine OBJ_cbs2nid BORINGSSL_PREFIX %+ _OBJ_cbs2nid %xdefine OBJ_cleanup BORINGSSL_PREFIX %+ _OBJ_cleanup %xdefine OBJ_cmp BORINGSSL_PREFIX %+ _OBJ_cmp @@ -4322,6 +4426,7 @@ %xdefine OPENSSL_gmtime_diff BORINGSSL_PREFIX %+ _OPENSSL_gmtime_diff %xdefine OPENSSL_hash32 BORINGSSL_PREFIX %+ _OPENSSL_hash32 %xdefine OPENSSL_ia32cap_P BORINGSSL_PREFIX %+ _OPENSSL_ia32cap_P +%xdefine OPENSSL_init_cpuid BORINGSSL_PREFIX %+ _OPENSSL_init_cpuid %xdefine OPENSSL_init_crypto BORINGSSL_PREFIX %+ _OPENSSL_init_crypto %xdefine OPENSSL_isalnum BORINGSSL_PREFIX %+ _OPENSSL_isalnum %xdefine OPENSSL_isalpha BORINGSSL_PREFIX %+ _OPENSSL_isalpha @@ -4531,7 +4636,6 @@ %xdefine RAND_SSLeay BORINGSSL_PREFIX %+ _RAND_SSLeay %xdefine RAND_add BORINGSSL_PREFIX %+ _RAND_add %xdefine RAND_bytes BORINGSSL_PREFIX %+ _RAND_bytes -%xdefine RAND_bytes_with_additional_data BORINGSSL_PREFIX %+ 
_RAND_bytes_with_additional_data %xdefine RAND_cleanup BORINGSSL_PREFIX %+ _RAND_cleanup %xdefine RAND_disable_fork_unsafe_buffering BORINGSSL_PREFIX %+ _RAND_disable_fork_unsafe_buffering %xdefine RAND_egd BORINGSSL_PREFIX %+ _RAND_egd @@ -4655,6 +4759,10 @@ %xdefine SPAKE2_CTX_new BORINGSSL_PREFIX %+ _SPAKE2_CTX_new %xdefine SPAKE2_generate_msg BORINGSSL_PREFIX %+ _SPAKE2_generate_msg %xdefine SPAKE2_process_msg BORINGSSL_PREFIX %+ _SPAKE2_process_msg +%xdefine SPX_generate_key BORINGSSL_PREFIX %+ _SPX_generate_key +%xdefine SPX_generate_key_from_seed BORINGSSL_PREFIX %+ _SPX_generate_key_from_seed +%xdefine SPX_sign BORINGSSL_PREFIX %+ _SPX_sign +%xdefine SPX_verify BORINGSSL_PREFIX %+ _SPX_verify %xdefine SSLeay BORINGSSL_PREFIX %+ _SSLeay %xdefine SSLeay_version BORINGSSL_PREFIX %+ _SSLeay_version %xdefine TRUST_TOKEN_CLIENT_add_key BORINGSSL_PREFIX %+ _TRUST_TOKEN_CLIENT_add_key @@ -4983,11 +5091,9 @@ %xdefine X509_STORE_load_locations BORINGSSL_PREFIX %+ _X509_STORE_load_locations %xdefine X509_STORE_new BORINGSSL_PREFIX %+ _X509_STORE_new %xdefine X509_STORE_set1_param BORINGSSL_PREFIX %+ _X509_STORE_set1_param -%xdefine X509_STORE_set_check_crl BORINGSSL_PREFIX %+ _X509_STORE_set_check_crl %xdefine X509_STORE_set_default_paths BORINGSSL_PREFIX %+ _X509_STORE_set_default_paths %xdefine X509_STORE_set_depth BORINGSSL_PREFIX %+ _X509_STORE_set_depth %xdefine X509_STORE_set_flags BORINGSSL_PREFIX %+ _X509_STORE_set_flags -%xdefine X509_STORE_set_get_crl BORINGSSL_PREFIX %+ _X509_STORE_set_get_crl %xdefine X509_STORE_set_purpose BORINGSSL_PREFIX %+ _X509_STORE_set_purpose %xdefine X509_STORE_set_trust BORINGSSL_PREFIX %+ _X509_STORE_set_trust %xdefine X509_STORE_set_verify_cb BORINGSSL_PREFIX %+ _X509_STORE_set_verify_cb @@ -5167,8 +5273,11 @@ %xdefine aes_hw_decrypt BORINGSSL_PREFIX %+ _aes_hw_decrypt %xdefine aes_hw_ecb_encrypt BORINGSSL_PREFIX %+ _aes_hw_ecb_encrypt %xdefine aes_hw_encrypt BORINGSSL_PREFIX %+ _aes_hw_encrypt +%xdefine aes_hw_encrypt_key_to_decrypt_key BORINGSSL_PREFIX %+ _aes_hw_encrypt_key_to_decrypt_key %xdefine aes_hw_set_decrypt_key BORINGSSL_PREFIX %+ _aes_hw_set_decrypt_key %xdefine aes_hw_set_encrypt_key BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key +%xdefine aes_hw_set_encrypt_key_alt BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key_alt +%xdefine aes_hw_set_encrypt_key_base BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key_base %xdefine aes_nohw_cbc_encrypt BORINGSSL_PREFIX %+ _aes_nohw_cbc_encrypt %xdefine aes_nohw_ctr32_encrypt_blocks BORINGSSL_PREFIX %+ _aes_nohw_ctr32_encrypt_blocks %xdefine aes_nohw_decrypt BORINGSSL_PREFIX %+ _aes_nohw_decrypt @@ -5240,19 +5349,22 @@ %xdefine bn_mont_ctx_set_RR_consttime BORINGSSL_PREFIX %+ _bn_mont_ctx_set_RR_consttime %xdefine bn_mont_n0 BORINGSSL_PREFIX %+ _bn_mont_n0 %xdefine bn_mul4x_mont BORINGSSL_PREFIX %+ _bn_mul4x_mont +%xdefine bn_mul4x_mont_gather5 BORINGSSL_PREFIX %+ _bn_mul4x_mont_gather5 %xdefine bn_mul_add_words BORINGSSL_PREFIX %+ _bn_mul_add_words %xdefine bn_mul_comba4 BORINGSSL_PREFIX %+ _bn_mul_comba4 %xdefine bn_mul_comba8 BORINGSSL_PREFIX %+ _bn_mul_comba8 %xdefine bn_mul_consttime BORINGSSL_PREFIX %+ _bn_mul_consttime %xdefine bn_mul_mont BORINGSSL_PREFIX %+ _bn_mul_mont -%xdefine bn_mul_mont_gather5 BORINGSSL_PREFIX %+ _bn_mul_mont_gather5 +%xdefine bn_mul_mont_gather5_nohw BORINGSSL_PREFIX %+ _bn_mul_mont_gather5_nohw %xdefine bn_mul_mont_nohw BORINGSSL_PREFIX %+ _bn_mul_mont_nohw %xdefine bn_mul_small BORINGSSL_PREFIX %+ _bn_mul_small %xdefine bn_mul_words BORINGSSL_PREFIX %+ _bn_mul_words %xdefine 
bn_mulx4x_mont BORINGSSL_PREFIX %+ _bn_mulx4x_mont +%xdefine bn_mulx4x_mont_gather5 BORINGSSL_PREFIX %+ _bn_mulx4x_mont_gather5 %xdefine bn_odd_number_is_obviously_composite BORINGSSL_PREFIX %+ _bn_odd_number_is_obviously_composite %xdefine bn_one_to_montgomery BORINGSSL_PREFIX %+ _bn_one_to_montgomery -%xdefine bn_power5 BORINGSSL_PREFIX %+ _bn_power5 +%xdefine bn_power5_nohw BORINGSSL_PREFIX %+ _bn_power5_nohw +%xdefine bn_powerx5 BORINGSSL_PREFIX %+ _bn_powerx5 %xdefine bn_rand_range_words BORINGSSL_PREFIX %+ _bn_rand_range_words %xdefine bn_rand_secret_range BORINGSSL_PREFIX %+ _bn_rand_secret_range %xdefine bn_reduce_once BORINGSSL_PREFIX %+ _bn_reduce_once @@ -5287,7 +5399,11 @@ %xdefine c2i_ASN1_INTEGER BORINGSSL_PREFIX %+ _c2i_ASN1_INTEGER %xdefine c2i_ASN1_OBJECT BORINGSSL_PREFIX %+ _c2i_ASN1_OBJECT %xdefine chacha20_poly1305_open BORINGSSL_PREFIX %+ _chacha20_poly1305_open +%xdefine chacha20_poly1305_open_avx2 BORINGSSL_PREFIX %+ _chacha20_poly1305_open_avx2 +%xdefine chacha20_poly1305_open_nohw BORINGSSL_PREFIX %+ _chacha20_poly1305_open_nohw %xdefine chacha20_poly1305_seal BORINGSSL_PREFIX %+ _chacha20_poly1305_seal +%xdefine chacha20_poly1305_seal_avx2 BORINGSSL_PREFIX %+ _chacha20_poly1305_seal_avx2 +%xdefine chacha20_poly1305_seal_nohw BORINGSSL_PREFIX %+ _chacha20_poly1305_seal_nohw %xdefine crypto_gcm_clmul_enabled BORINGSSL_PREFIX %+ _crypto_gcm_clmul_enabled %xdefine d2i_ASN1_BIT_STRING BORINGSSL_PREFIX %+ _d2i_ASN1_BIT_STRING %xdefine d2i_ASN1_BMPSTRING BORINGSSL_PREFIX %+ _d2i_ASN1_BMPSTRING @@ -5396,8 +5512,10 @@ %xdefine d2i_X509_VAL BORINGSSL_PREFIX %+ _d2i_X509_VAL %xdefine d2i_X509_bio BORINGSSL_PREFIX %+ _d2i_X509_bio %xdefine d2i_X509_fp BORINGSSL_PREFIX %+ _d2i_X509_fp +%xdefine dh_asn1_meth BORINGSSL_PREFIX %+ _dh_asn1_meth %xdefine dh_check_params_fast BORINGSSL_PREFIX %+ _dh_check_params_fast %xdefine dh_compute_key_padded_no_self_test BORINGSSL_PREFIX %+ _dh_compute_key_padded_no_self_test +%xdefine dh_pkey_meth BORINGSSL_PREFIX %+ _dh_pkey_meth %xdefine dsa_asn1_meth BORINGSSL_PREFIX %+ _dsa_asn1_meth %xdefine dsa_check_key BORINGSSL_PREFIX %+ _dsa_check_key %xdefine ec_GFp_mont_add BORINGSSL_PREFIX %+ _ec_GFp_mont_add @@ -5487,25 +5605,46 @@ %xdefine ec_set_to_safe_point BORINGSSL_PREFIX %+ _ec_set_to_safe_point %xdefine ec_simple_scalar_inv0_montgomery BORINGSSL_PREFIX %+ _ec_simple_scalar_inv0_montgomery %xdefine ec_simple_scalar_to_montgomery_inv_vartime BORINGSSL_PREFIX %+ _ec_simple_scalar_to_montgomery_inv_vartime -%xdefine ecdsa_do_verify_no_self_test BORINGSSL_PREFIX %+ _ecdsa_do_verify_no_self_test -%xdefine ecdsa_sign_with_nonce_for_known_answer_test BORINGSSL_PREFIX %+ _ecdsa_sign_with_nonce_for_known_answer_test -%xdefine ecp_nistz256_avx2_select_w7 BORINGSSL_PREFIX %+ _ecp_nistz256_avx2_select_w7 +%xdefine ecdsa_sign_fixed BORINGSSL_PREFIX %+ _ecdsa_sign_fixed +%xdefine ecdsa_sign_fixed_with_nonce_for_known_answer_test BORINGSSL_PREFIX %+ _ecdsa_sign_fixed_with_nonce_for_known_answer_test +%xdefine ecdsa_verify_fixed BORINGSSL_PREFIX %+ _ecdsa_verify_fixed +%xdefine ecdsa_verify_fixed_no_self_test BORINGSSL_PREFIX %+ _ecdsa_verify_fixed_no_self_test %xdefine ecp_nistz256_div_by_2 BORINGSSL_PREFIX %+ _ecp_nistz256_div_by_2 %xdefine ecp_nistz256_mul_by_2 BORINGSSL_PREFIX %+ _ecp_nistz256_mul_by_2 %xdefine ecp_nistz256_mul_by_3 BORINGSSL_PREFIX %+ _ecp_nistz256_mul_by_3 %xdefine ecp_nistz256_mul_mont BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont +%xdefine ecp_nistz256_mul_mont_adx BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont_adx +%xdefine 
ecp_nistz256_mul_mont_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont_nohw %xdefine ecp_nistz256_neg BORINGSSL_PREFIX %+ _ecp_nistz256_neg %xdefine ecp_nistz256_ord_mul_mont BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont +%xdefine ecp_nistz256_ord_mul_mont_adx BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont_adx +%xdefine ecp_nistz256_ord_mul_mont_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont_nohw %xdefine ecp_nistz256_ord_sqr_mont BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont +%xdefine ecp_nistz256_ord_sqr_mont_adx BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont_adx +%xdefine ecp_nistz256_ord_sqr_mont_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont_nohw %xdefine ecp_nistz256_point_add BORINGSSL_PREFIX %+ _ecp_nistz256_point_add +%xdefine ecp_nistz256_point_add_adx BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_adx %xdefine ecp_nistz256_point_add_affine BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine +%xdefine ecp_nistz256_point_add_affine_adx BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine_adx +%xdefine ecp_nistz256_point_add_affine_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine_nohw +%xdefine ecp_nistz256_point_add_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_nohw %xdefine ecp_nistz256_point_double BORINGSSL_PREFIX %+ _ecp_nistz256_point_double +%xdefine ecp_nistz256_point_double_adx BORINGSSL_PREFIX %+ _ecp_nistz256_point_double_adx +%xdefine ecp_nistz256_point_double_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_point_double_nohw %xdefine ecp_nistz256_select_w5 BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5 +%xdefine ecp_nistz256_select_w5_avx2 BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5_avx2 +%xdefine ecp_nistz256_select_w5_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5_nohw %xdefine ecp_nistz256_select_w7 BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7 +%xdefine ecp_nistz256_select_w7_avx2 BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7_avx2 +%xdefine ecp_nistz256_select_w7_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7_nohw %xdefine ecp_nistz256_sqr_mont BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont +%xdefine ecp_nistz256_sqr_mont_adx BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont_adx +%xdefine ecp_nistz256_sqr_mont_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont_nohw %xdefine ecp_nistz256_sub BORINGSSL_PREFIX %+ _ecp_nistz256_sub %xdefine ed25519_asn1_meth BORINGSSL_PREFIX %+ _ed25519_asn1_meth %xdefine ed25519_pkey_meth BORINGSSL_PREFIX %+ _ed25519_pkey_meth +%xdefine evp_md_md5_sha1 BORINGSSL_PREFIX %+ _evp_md_md5_sha1 +%xdefine evp_pkey_set_method BORINGSSL_PREFIX %+ _evp_pkey_set_method %xdefine fiat_curve25519_adx_mul BORINGSSL_PREFIX %+ _fiat_curve25519_adx_mul %xdefine fiat_curve25519_adx_square BORINGSSL_PREFIX %+ _fiat_curve25519_adx_square %xdefine fiat_p256_adx_mul BORINGSSL_PREFIX %+ _fiat_p256_adx_mul @@ -5747,8 +5886,6 @@ %xdefine spx_fors_sign BORINGSSL_PREFIX %+ _spx_fors_sign %xdefine spx_fors_sk_gen BORINGSSL_PREFIX %+ _spx_fors_sk_gen %xdefine spx_fors_treehash BORINGSSL_PREFIX %+ _spx_fors_treehash -%xdefine spx_generate_key BORINGSSL_PREFIX %+ _spx_generate_key -%xdefine spx_generate_key_from_seed BORINGSSL_PREFIX %+ _spx_generate_key_from_seed %xdefine spx_get_tree_index BORINGSSL_PREFIX %+ _spx_get_tree_index %xdefine spx_ht_sign BORINGSSL_PREFIX %+ _spx_ht_sign %xdefine spx_ht_verify BORINGSSL_PREFIX %+ _spx_ht_verify @@ -5760,7 +5897,6 @@ %xdefine spx_set_tree_height BORINGSSL_PREFIX %+ _spx_set_tree_height %xdefine spx_set_tree_index BORINGSSL_PREFIX %+ _spx_set_tree_index %xdefine spx_set_type BORINGSSL_PREFIX %+ _spx_set_type -%xdefine spx_sign 
BORINGSSL_PREFIX %+ _spx_sign %xdefine spx_thash_f BORINGSSL_PREFIX %+ _spx_thash_f %xdefine spx_thash_h BORINGSSL_PREFIX %+ _spx_thash_h %xdefine spx_thash_hmsg BORINGSSL_PREFIX %+ _spx_thash_hmsg @@ -5771,12 +5907,12 @@ %xdefine spx_to_uint64 BORINGSSL_PREFIX %+ _spx_to_uint64 %xdefine spx_treehash BORINGSSL_PREFIX %+ _spx_treehash %xdefine spx_uint64_to_len_bytes BORINGSSL_PREFIX %+ _spx_uint64_to_len_bytes -%xdefine spx_verify BORINGSSL_PREFIX %+ _spx_verify %xdefine spx_wots_pk_from_sig BORINGSSL_PREFIX %+ _spx_wots_pk_from_sig %xdefine spx_wots_pk_gen BORINGSSL_PREFIX %+ _spx_wots_pk_gen %xdefine spx_wots_sign BORINGSSL_PREFIX %+ _spx_wots_sign %xdefine spx_xmss_pk_from_sig BORINGSSL_PREFIX %+ _spx_xmss_pk_from_sig %xdefine spx_xmss_sign BORINGSSL_PREFIX %+ _spx_xmss_sign +%xdefine swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE BORINGSSL_PREFIX %+ _swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE %xdefine v2i_GENERAL_NAME BORINGSSL_PREFIX %+ _v2i_GENERAL_NAME %xdefine v2i_GENERAL_NAMES BORINGSSL_PREFIX %+ _v2i_GENERAL_NAMES %xdefine v2i_GENERAL_NAME_ex BORINGSSL_PREFIX %+ _v2i_GENERAL_NAME_ex diff --git a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_dilithium.h b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_dilithium.h new file mode 100644 index 00000000..5abd7387 --- /dev/null +++ b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_dilithium.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2023, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_DILITHIUM_H +#define OPENSSL_HEADER_DILITHIUM_H + +#include "CCryptoBoringSSL_base.h" + +#if defined(__cplusplus) +extern "C" { +#endif + + +#if defined(OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM) +// The ML-DSA spec has now been standardized and ML-DSA is available in +// BoringSSL. This code should no longer be used. It was intended for +// short-lived experiments and must not have been deployed anywhere durable. If +// you were using this you need to use the instead. This +// header and code will be removed from BoringSSL soon. + +// Dilithium3. + +// DILITHIUM_private_key contains a Dilithium3 private key. The contents of this +// object should never leave the address space since the format is unstable. +struct DILITHIUM_private_key { + union { + uint8_t bytes[32 + 32 + 64 + 256 * 4 * (5 + 6 + 6)]; + uint32_t alignment; + } opaque; +}; + +// DILITHIUM_public_key contains a Dilithium3 public key. The contents of this +// object should never leave the address space since the format is unstable. +struct DILITHIUM_public_key { + union { + uint8_t bytes[32 + 64 + 256 * 4 * 6]; + uint32_t alignment; + } opaque; +}; + +// DILITHIUM_PRIVATE_KEY_BYTES is the number of bytes in an encoded Dilithium3 +// private key. 
+#define DILITHIUM_PRIVATE_KEY_BYTES 4032 + +// DILITHIUM_PUBLIC_KEY_BYTES is the number of bytes in an encoded Dilithium3 +// public key. +#define DILITHIUM_PUBLIC_KEY_BYTES 1952 + +// DILITHIUM_SIGNATURE_BYTES is the number of bytes in an encoded Dilithium3 +// signature. +#define DILITHIUM_SIGNATURE_BYTES 3309 + +// DILITHIUM_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key| and sets |out_private_key| to +// the private key. Returns 1 on success and 0 on failure. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_generate_key( + uint8_t out_encoded_public_key[DILITHIUM_PUBLIC_KEY_BYTES], + struct DILITHIUM_private_key *out_private_key); + +// DILITHIUM_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. Returns 1 on success and 0 on failure. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_public_from_private( + struct DILITHIUM_public_key *out_public_key, + const struct DILITHIUM_private_key *private_key); + +// DILITHIUM_sign generates a signature for the message |msg| of length +// |msg_len| using |private_key| following the randomized algorithm, and writes +// the encoded signature to |out_encoded_signature|. Returns 1 on success and 0 +// on failure. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_sign( + uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, const uint8_t *msg, + size_t msg_len); + +// DILITHIUM_verify verifies that |encoded_signature| constitutes a valid +// signature for the message |msg| of length |msg_len| using |public_key|. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_verify( + const struct DILITHIUM_public_key *public_key, + const uint8_t encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len); + + +// Serialisation of keys. + +// DILITHIUM_marshal_public_key serializes |public_key| to |out| in the standard +// format for Dilithium public keys. It returns one on success or zero on +// allocation error. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_marshal_public_key( + CBB *out, const struct DILITHIUM_public_key *public_key); + +// DILITHIUM_parse_public_key parses a public key, in the format generated by +// |DILITHIUM_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_parse_public_key( + struct DILITHIUM_public_key *public_key, CBS *in); + +// DILITHIUM_marshal_private_key serializes |private_key| to |out| in the +// standard format for Dilithium private keys. It returns one on success or zero +// on allocation error. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_marshal_private_key( + CBB *out, const struct DILITHIUM_private_key *private_key); + +// DILITHIUM_parse_private_key parses a private key, in the format generated by +// |DILITHIUM_marshal_private_key|, from |in| and writes the result to +// |out_private_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. 
+OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_parse_private_key( + struct DILITHIUM_private_key *private_key, CBS *in); + +#endif // OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_DILITHIUM_H diff --git a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_kyber.h b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_kyber.h index aba5f4b5..b9c969f9 100644 --- a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_kyber.h +++ b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_kyber.h @@ -22,6 +22,14 @@ extern "C" { #endif +#if defined(OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER) +// This header implements experimental, draft versions of not-yet-standardized +// primitives. When the standard is complete, these functions will be removed +// and replaced with the final, incompatible standard version. They are +// available now for short-lived experiments, but must not be deployed anywhere +// durable, such as a long-lived key store. To use these functions define +// OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER + // Kyber768. // // This implements the round-3 specification of Kyber, defined at @@ -128,6 +136,8 @@ OPENSSL_EXPORT int KYBER_marshal_private_key( OPENSSL_EXPORT int KYBER_parse_private_key( struct KYBER_private_key *out_private_key, CBS *in); +#endif // OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER + #if defined(__cplusplus) } // extern C diff --git a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_spx.h b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_spx.h index 4ba38833..798831c2 100644 --- a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_spx.h +++ b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_spx.h @@ -22,6 +22,14 @@ extern "C" { #endif +#if defined(OPENSSL_UNSTABLE_EXPERIMENTAL_SPX) +// This header implements experimental, draft versions of not-yet-standardized +// primitives. When the standard is complete, these functions will be removed +// and replaced with the final, incompatible standard version. They are +// available now for short-lived experiments, but must not be deployed anywhere +// durable, such as a long-lived key store. To use these functions define +// OPENSSL_UNSTABLE_EXPERIMENTAL_SPX + // SPX_N is the number of bytes in the hash output #define SPX_N 16 @@ -37,39 +45,42 @@ extern "C" { // SPHINCS+-SHA2-128s #define SPX_SIGNATURE_BYTES 7856 -// spx_generate_key generates a SPHINCS+-SHA2-128s key pair and writes the +// SPX_generate_key generates a SPHINCS+-SHA2-128s key pair and writes the // result to |out_public_key| and |out_secret_key|. // Private key: SK.seed || SK.prf || PK.seed || PK.root // Public key: PK.seed || PK.root -OPENSSL_EXPORT void spx_generate_key( +OPENSSL_EXPORT void SPX_generate_key( uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], uint8_t out_secret_key[SPX_SECRET_KEY_BYTES]); -// spx_generate_key_from_seed generates a SPHINCS+-SHA2-128s key pair from a +// SPX_generate_key_from_seed generates a SPHINCS+-SHA2-128s key pair from a // 48-byte seed and writes the result to |out_public_key| and |out_secret_key|. 
// Secret key: SK.seed || SK.prf || PK.seed || PK.root // Public key: PK.seed || PK.root -OPENSSL_EXPORT void spx_generate_key_from_seed( +OPENSSL_EXPORT void SPX_generate_key_from_seed( uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], uint8_t out_secret_key[SPX_SECRET_KEY_BYTES], const uint8_t seed[3 * SPX_N]); -// spx_sign generates a SPHINCS+-SHA2-128s signature over |msg| or length +// SPX_sign generates a SPHINCS+-SHA2-128s signature over |msg| or length // |msg_len| using |secret_key| and writes the output to |out_signature|. // // if |randomized| is 0, deterministic signing is performed, otherwise, // non-deterministic signing is performed. -OPENSSL_EXPORT void spx_sign(uint8_t out_snignature[SPX_SIGNATURE_BYTES], - const uint8_t secret_key[SPX_SECRET_KEY_BYTES], - const uint8_t *msg, size_t msg_len, - int randomized); +OPENSSL_EXPORT void SPX_sign( + uint8_t out_snignature[SPX_SIGNATURE_BYTES], + const uint8_t secret_key[SPX_SECRET_KEY_BYTES], const uint8_t *msg, + size_t msg_len, int randomized); -// spx_verify verifies a SPHINCS+-SHA2-128s signature in |signature| over |msg| +// SPX_verify verifies a SPHINCS+-SHA2-128s signature in |signature| over |msg| // or length |msg_len| using |public_key|. 1 is returned if the signature // matches, 0 otherwise. -OPENSSL_EXPORT int spx_verify(const uint8_t signature[SPX_SIGNATURE_BYTES], - const uint8_t public_key[SPX_SECRET_KEY_BYTES], - const uint8_t *msg, size_t msg_len); +OPENSSL_EXPORT int SPX_verify( + const uint8_t signature[SPX_SIGNATURE_BYTES], + const uint8_t public_key[SPX_SECRET_KEY_BYTES], const uint8_t *msg, + size_t msg_len); + +#endif //OPENSSL_UNSTABLE_EXPERIMENTAL_SPX #if defined(__cplusplus) diff --git a/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_mul.S b/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_mul.S index 4fa322fb..a8b18dfb 100644 --- a/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_mul.S +++ b/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_mul.S @@ -4,7 +4,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ (defined(__APPLE__) || defined(__ELF__)) -.intel_syntax noprefix .text #if defined(__APPLE__) .private_extern _fiat_p256_adx_mul @@ -19,158 +18,158 @@ fiat_p256_adx_mul: .cfi_startproc _CET_ENDBR -push rbp -.cfi_adjust_cfa_offset 8 +pushq %rbp +;.cfi_adjust_cfa_offset 8 .cfi_offset rbp, -16 -mov rbp, rsp -mov rax, rdx -mov rdx, [ rsi + 0x0 ] -test al, al -mulx r8, rcx, [ rax + 0x0 ] -mov [ rsp - 0x80 ], rbx +movq %rsp, %rbp +movq %rdx, %rax +movq (%rsi), %rdx +testb %al, %al +mulxq (%rax), %rcx, %r8 +movq %rbx, -0x80(%rsp) .cfi_offset rbx, -16-0x80 -mulx rbx, r9, [ rax + 0x8 ] -mov [ rsp - 0x68 ], r14 +mulxq 0x8(%rax), %r9, %rbx +movq %r14, -0x68(%rsp) .cfi_offset r14, -16-0x68 -adc r9, r8 -mov [ rsp - 0x60 ], r15 +adcq %r8, %r9 +movq %r15, -0x60(%rsp) .cfi_offset r15, -16-0x60 -mulx r15, r14, [ rax + 0x10 ] -mov [ rsp - 0x78 ], r12 +mulxq 0x10(%rax), %r14, %r15 +movq %r12, -0x78(%rsp) .cfi_offset r12, -16-0x78 -adc r14, rbx -mulx r11, r10, [ rax + 0x18 ] -mov [ rsp - 0x70 ], r13 +adcq %rbx, %r14 +mulxq 0x18(%rax), %r10, %r11 +movq %r13, -0x70(%rsp) .cfi_offset r13, -16-0x70 -adc r10, r15 -mov rdx, [ rsi + 0x8 ] -mulx rbx, r8, [ rax + 0x0 ] -adc r11, 0x0 -xor r15, r15 -adcx r8, r9 -adox rbx, r14 -mov [ rsp - 0x58 ], rdi -mulx rdi, r9, [ rax + 0x8 ] -adcx r9, rbx -adox rdi, r10 -mulx rbx, r14, [ rax + 0x10 ] -adcx r14, rdi -adox rbx, r11 -mulx r13, r12, [ rax + 0x18 ] -adcx r12, rbx -mov rdx, 0x100000000 -mulx r11, r10, rcx -adox r13, r15 -adcx 
r13, r15 -xor rdi, rdi -adox r10, r8 -mulx r8, rbx, r10 -adox r11, r9 -adcx rbx, r11 -adox r8, r14 -mov rdx, 0xffffffff00000001 -mulx r9, r15, rcx -adcx r15, r8 -adox r9, r12 -mulx r14, rcx, r10 -mov rdx, [ rsi + 0x10 ] -mulx r10, r12, [ rax + 0x8 ] -adcx rcx, r9 -adox r14, r13 -mulx r11, r13, [ rax + 0x0 ] -mov r9, rdi -adcx r14, r9 -adox rdi, rdi -adc rdi, 0x0 -xor r9, r9 -adcx r13, rbx -adox r11, r15 -mov rdx, [ rsi + 0x10 ] -mulx r15, r8, [ rax + 0x10 ] -adox r10, rcx -mulx rcx, rbx, [ rax + 0x18 ] -mov rdx, [ rsi + 0x18 ] -adcx r12, r11 -mulx rsi, r11, [ rax + 0x8 ] -adcx r8, r10 -adox r15, r14 -adcx rbx, r15 -adox rcx, r9 -adcx rcx, r9 -mulx r15, r10, [ rax + 0x0 ] -add rcx, rdi -mov r14, r9 -adc r14, 0 -xor r9, r9 -adcx r10, r12 -adox r15, r8 -adcx r11, r15 -adox rsi, rbx -mulx r8, r12, [ rax + 0x10 ] -adox r8, rcx -mulx rcx, rbx, [ rax + 0x18 ] -adcx r12, rsi -adox rcx, r9 -mov rdx, 0x100000000 -adcx rbx, r8 -adc rcx, 0 -mulx rdi, r15, r13 -xor rax, rax -adcx rcx, r14 -adc rax, 0 -xor r9, r9 -adox r15, r10 -mulx r14, r10, r15 -adox rdi, r11 -mov rdx, 0xffffffff00000001 -adox r14, r12 -adcx r10, rdi -mulx r12, r11, r13 -adcx r11, r14 -adox r12, rbx -mulx rbx, r13, r15 -adcx r13, r12 -adox rbx, rcx -mov r8, r9 -adox rax, r9 -adcx r8, rbx -adc rax, 0x0 -mov rcx, rax -mov r15, 0xffffffffffffffff -mov rdi, r10 -sub rdi, r15 -mov r14, 0xffffffff -mov r12, r11 -sbb r12, r14 -mov rbx, r13 -sbb rbx, r9 -mov rax, rax -mov rax, r8 -sbb rax, rdx -sbb rcx, r9 -cmovc rdi, r10 -mov r10, [ rsp - 0x58 ] -cmovc rbx, r13 -mov r13, [ rsp - 0x70 ] +adcq %r15, %r10 +movq 0x8(%rsi), %rdx +mulxq (%rax), %r8, %rbx +adcq $0x0, %r11 +xorq %r15, %r15 +adcxq %r9, %r8 +adoxq %r14, %rbx +movq %rdi, -0x58(%rsp) +mulxq 0x8(%rax), %r9, %rdi +adcxq %rbx, %r9 +adoxq %r10, %rdi +mulxq 0x10(%rax), %r14, %rbx +adcxq %rdi, %r14 +adoxq %r11, %rbx +mulxq 0x18(%rax), %r12, %r13 +adcxq %rbx, %r12 +movq $0x100000000, %rdx +mulxq %rcx, %r10, %r11 +adoxq %r15, %r13 +adcxq %r15, %r13 +xorq %rdi, %rdi +adoxq %r8, %r10 +mulxq %r10, %rbx, %r8 +adoxq %r9, %r11 +adcxq %r11, %rbx +adoxq %r14, %r8 +movq $0xffffffff00000001, %rdx +mulxq %rcx, %r15, %r9 +adcxq %r8, %r15 +adoxq %r12, %r9 +mulxq %r10, %rcx, %r14 +movq 0x10(%rsi), %rdx +mulxq 0x8(%rax), %r12, %r10 +adcxq %r9, %rcx +adoxq %r13, %r14 +mulxq (%rax), %r13, %r11 +movq %rdi, %r9 +adcxq %r9, %r14 +adoxq %rdi, %rdi +adcq $0x0, %rdi +xorq %r9, %r9 +adcxq %rbx, %r13 +adoxq %r15, %r11 +movq 0x10(%rsi), %rdx +mulxq 0x10(%rax), %r8, %r15 +adoxq %rcx, %r10 +mulxq 0x18(%rax), %rbx, %rcx +movq 0x18(%rsi), %rdx +adcxq %r11, %r12 +mulxq 0x8(%rax), %r11, %rsi +adcxq %r10, %r8 +adoxq %r14, %r15 +adcxq %r15, %rbx +adoxq %r9, %rcx +adcxq %r9, %rcx +mulxq (%rax), %r10, %r15 +addq %rdi, %rcx +movq %r9, %r14 +adcq $0x0, %r14 +xorq %r9, %r9 +adcxq %r12, %r10 +adoxq %r8, %r15 +adcxq %r15, %r11 +adoxq %rbx, %rsi +mulxq 0x10(%rax), %r12, %r8 +adoxq %rcx, %r8 +mulxq 0x18(%rax), %rbx, %rcx +adcxq %rsi, %r12 +adoxq %r9, %rcx +movq $0x100000000, %rdx +adcxq %r8, %rbx +adcq $0x0, %rcx +mulxq %r13, %r15, %rdi +xorq %rax, %rax +adcxq %r14, %rcx +adcq $0x0, %rax +xorq %r9, %r9 +adoxq %r10, %r15 +mulxq %r15, %r10, %r14 +adoxq %r11, %rdi +movq $0xffffffff00000001, %rdx +adoxq %r12, %r14 +adcxq %rdi, %r10 +mulxq %r13, %r11, %r12 +adcxq %r14, %r11 +adoxq %rbx, %r12 +mulxq %r15, %r13, %rbx +adcxq %r12, %r13 +adoxq %rcx, %rbx +movq %r9, %r8 +adoxq %r9, %rax +adcxq %rbx, %r8 +adcq $0x0, %rax +movq %rax, %rcx +movq $0xffffffffffffffff, %r15 +movq %r10, %rdi +subq %r15, %rdi +movq $0xffffffff, %r14 +movq %r11, %r12 
+sbbq %r14, %r12 +movq %r13, %rbx +sbbq %r9, %rbx +movq %rax, %rax +movq %r8, %rax +sbbq %rdx, %rax +sbbq %r9, %rcx +cmovcq %r10, %rdi +movq -0x58(%rsp), %r10 +cmovcq %r13, %rbx +movq -0x70(%rsp), %r13 .cfi_restore r13 -cmovc r12, r11 -cmovc rax, r8 -mov [ r10 + 0x10 ], rbx -mov rbx, [ rsp - 0x80 ] +cmovcq %r11, %r12 +cmovcq %r8, %rax +movq %rbx, 0x10(%r10) +movq -0x80(%rsp), %rbx .cfi_restore rbx -mov [ r10 + 0x0 ], rdi -mov [ r10 + 0x8 ], r12 -mov [ r10 + 0x18 ], rax -mov r12, [ rsp - 0x78 ] +movq %rdi, (%r10) +movq %r12, 0x8(%r10) +movq %rax, 0x18(%r10) +movq -0x78(%rsp), %r12 .cfi_restore r12 -mov r14, [ rsp - 0x68 ] +movq -0x68(%rsp), %r14 .cfi_restore r14 -mov r15, [ rsp - 0x60 ] +movq -0x60(%rsp), %r15 .cfi_restore r15 -pop rbp +popq %rbp .cfi_restore rbp .cfi_adjust_cfa_offset -8 -ret +retq .cfi_endproc #if defined(__ELF__) .size fiat_p256_adx_mul, .-fiat_p256_adx_mul diff --git a/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_sqr.S b/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_sqr.S index ee6da27a..3ee05295 100644 --- a/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_sqr.S +++ b/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_sqr.S @@ -4,7 +4,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ (defined(__APPLE__) || defined(__ELF__)) -.intel_syntax noprefix .text #if defined(__APPLE__) .private_extern _fiat_p256_adx_sqr @@ -19,147 +18,147 @@ fiat_p256_adx_sqr: .cfi_startproc _CET_ENDBR -push rbp +pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset rbp, -16 -mov rbp, rsp -mov rdx, [ rsi + 0x0 ] -mulx r10, rax, [ rsi + 0x18 ] -mulx rcx, r11, rdx -mulx r9, r8, [ rsi + 0x8 ] -mov [ rsp - 0x80 ], rbx +movq %rsp, %rbp +movq (%rsi), %rdx +mulxq 0x18(%rsi), %rax, %r10 +mulxq %rdx, %r11, %rcx +mulxq 0x8(%rsi), %r8, %r9 +movq %rbx, -0x80(%rsp) .cfi_offset rbx, -16-0x80 -xor rbx, rbx -adox r8, r8 -mov [ rsp - 0x78 ], r12 +xorq %rbx, %rbx +adoxq %r8, %r8 +movq %r12, -0x78(%rsp) .cfi_offset r12, -16-0x78 -mulx r12, rbx, [ rsi + 0x10 ] -mov rdx, [ rsi + 0x8 ] -mov [ rsp - 0x70 ], r13 +mulxq 0x10(%rsi), %rbx, %r12 +movq 0x8(%rsi), %rdx +movq %r13, -0x70(%rsp) .cfi_offset r13, -16-0x70 -mov [ rsp - 0x68 ], r14 +movq %r14, -0x68(%rsp) .cfi_offset r14, -16-0x68 -mulx r14, r13, rdx -mov [ rsp - 0x60 ], r15 +mulxq %rdx, %r13, %r14 +movq %r15, -0x60(%rsp) .cfi_offset r15, -16-0x60 -mov [ rsp - 0x58 ], rdi -mulx rdi, r15, [ rsi + 0x10 ] -adcx r12, r15 -mov [ rsp - 0x50 ], r11 -mulx r11, r15, [ rsi + 0x18 ] -adcx r10, rdi -mov rdi, 0x0 -adcx r11, rdi +movq %rdi, -0x58(%rsp) +mulxq 0x10(%rsi), %r15, %rdi +adcxq %r15, %r12 +movq %r11, -0x50(%rsp) +mulxq 0x18(%rsi), %r15, %r11 +adcxq %rdi, %r10 +movq $0x0, %rdi +adcxq %rdi, %r11 clc -adcx rbx, r9 -adox rbx, rbx -adcx rax, r12 -adox rax, rax -adcx r15, r10 -adox r15, r15 -mov rdx, [ rsi + 0x10 ] -mulx r12, r9, [ rsi + 0x18 ] -adcx r9, r11 -adcx r12, rdi -mulx r11, r10, rdx +adcxq %r9, %rbx +adoxq %rbx, %rbx +adcxq %r12, %rax +adoxq %rax, %rax +adcxq %r10, %r15 +adoxq %r15, %r15 +movq 0x10(%rsi), %rdx +mulxq 0x18(%rsi), %r9, %r12 +adcxq %r11, %r9 +adcxq %rdi, %r12 +mulxq %rdx, %r10, %r11 clc -adcx rcx, r8 -adcx r13, rbx -adcx r14, rax -adox r9, r9 -adcx r10, r15 -mov rdx, [ rsi + 0x18 ] -mulx rbx, r8, rdx -adox r12, r12 -adcx r11, r9 -mov rsi, [ rsp - 0x50 ] -adcx r8, r12 -mov rax, 0x100000000 -mov rdx, rax -mulx r15, rax, rsi -adcx rbx, rdi -adox rbx, rdi -xor r9, r9 -adox rax, rcx -adox r15, r13 -mulx rcx, rdi, rax -adcx rdi, r15 -adox rcx, r14 -mov rdx, 0xffffffff00000001 -mulx r14, r13, rsi 
-adox r14, r10 -adcx r13, rcx -mulx r12, r10, rax -adox r12, r11 -mov r11, r9 -adox r11, r8 -adcx r10, r14 -mov r8, r9 -adcx r8, r12 -mov rax, r9 -adcx rax, r11 -mov r15, r9 -adox r15, rbx -mov rdx, 0x100000000 -mulx rcx, rbx, rdi -mov r14, r9 -adcx r14, r15 -mov r12, r9 -adox r12, r12 -adcx r12, r9 -adox rbx, r13 -mulx r11, r13, rbx -mov r15, 0xffffffff00000001 -mov rdx, r15 -mulx rsi, r15, rbx -adox rcx, r10 -adox r11, r8 -mulx r8, r10, rdi -adcx r13, rcx -adox r8, rax -adcx r10, r11 -adox rsi, r14 -mov rdi, r12 -mov rax, r9 -adox rdi, rax -adcx r15, r8 -mov r14, rax -adcx r14, rsi -adcx rdi, r9 -dec r9 -mov rbx, r13 -sub rbx, r9 -mov rcx, 0xffffffff -mov r11, r10 -sbb r11, rcx -mov r8, r15 -sbb r8, rax -mov rsi, r14 -sbb rsi, rdx -sbb rdi, rax -cmovc rbx, r13 -cmovc r8, r15 -cmovc r11, r10 -cmovc rsi, r14 -mov rdi, [ rsp - 0x58 ] -mov [ rdi + 0x18 ], rsi -mov [ rdi + 0x0 ], rbx -mov [ rdi + 0x8 ], r11 -mov [ rdi + 0x10 ], r8 -mov rbx, [ rsp - 0x80 ] +adcxq %r8, %rcx +adcxq %rbx, %r13 +adcxq %rax, %r14 +adoxq %r9, %r9 +adcxq %r15, %r10 +movq 0x18(%rsi), %rdx +mulxq %rdx, %r8, %rbx +adoxq %r12, %r12 +adcxq %r9, %r11 +movq -0x50(%rsp), %rsi +adcxq %r12, %r8 +movq $0x100000000, %rax +movq %rax, %rdx +mulxq %rsi, %rax, %r15 +adcxq %rdi, %rbx +adoxq %rdi, %rbx +xorq %r9, %r9 +adoxq %rcx, %rax +adoxq %r13, %r15 +mulxq %rax, %rdi, %rcx +adcxq %r15, %rdi +adoxq %r14, %rcx +movq $0xffffffff00000001, %rdx +mulxq %rsi, %r13, %r14 +adoxq %r10, %r14 +adcxq %rcx, %r13 +mulxq %rax, %r10, %r12 +adoxq %r11, %r12 +movq %r9, %r11 +adoxq %r8, %r11 +adcxq %r14, %r10 +movq %r9, %r8 +adcxq %r12, %r8 +movq %r9, %rax +adcxq %r11, %rax +movq %r9, %r15 +adoxq %rbx, %r15 +movq $0x100000000, %rdx +mulxq %rdi, %rbx, %rcx +movq %r9, %r14 +adcxq %r15, %r14 +movq %r9, %r12 +adoxq %r12, %r12 +adcxq %r9, %r12 +adoxq %r13, %rbx +mulxq %rbx, %r13, %r11 +movq $0xffffffff00000001, %r15 +movq %r15, %rdx +mulxq %rbx, %r15, %rsi +adoxq %r10, %rcx +adoxq %r8, %r11 +mulxq %rdi, %r10, %r8 +adcxq %rcx, %r13 +adoxq %rax, %r8 +adcxq %r11, %r10 +adoxq %r14, %rsi +movq %r12, %rdi +movq %r9, %rax +adoxq %rax, %rdi +adcxq %r8, %r15 +movq %rax, %r14 +adcxq %rsi, %r14 +adcxq %r9, %rdi +decq %r9 +movq %r13, %rbx +subq %r9, %rbx +movq $0xffffffff, %rcx +movq %r10, %r11 +sbbq %rcx, %r11 +movq %r15, %r8 +sbbq %rax, %r8 +movq %r14, %rsi +sbbq %rdx, %rsi +sbbq %rax, %rdi +cmovcq %r13, %rbx +cmovcq %r15, %r8 +cmovcq %r10, %r11 +cmovcq %r14, %rsi +movq -0x58(%rsp), %rdi +movq %rsi, 0x18(%rdi) +movq %rbx, (%rdi) +movq %r11, 0x8(%rdi) +movq %r8, 0x10(%rdi) +movq -0x80(%rsp), %rbx .cfi_restore rbx -mov r12, [ rsp - 0x78 ] +movq -0x78(%rsp), %r12 .cfi_restore r12 -mov r13, [ rsp - 0x70 ] +movq -0x70(%rsp), %r13 .cfi_restore r13 -mov r14, [ rsp - 0x68 ] +movq -0x68(%rsp), %r14 .cfi_restore r14 -mov r15, [ rsp - 0x60 ] +movq -0x60(%rsp), %r15 .cfi_restore r15 -pop rbp +popq %rbp .cfi_restore rbp .cfi_adjust_cfa_offset -8 -ret +retq .cfi_endproc #if defined(__ELF__) .size fiat_p256_adx_sqr, .-fiat_p256_adx_sqr
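The new experimental Dilithium3 header added in this diff declares a complete keypair/sign/verify surface, with every function marked OPENSSL_DEPRECATED and the whole API gated behind OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM. A minimal round-trip sketch against exactly those declarations follows; it is illustrative only (the header itself says this code path is superseded by ML-DSA and will be removed), and the include path and build flag are assumptions about how the vendored headers are consumed.

/* Sketch only: exercises the deprecated Dilithium3 declarations shown in the
 * header above. Assumes -DOPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM and that the
 * vendored experimental header directory is on the include path. */
#define OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM
#include "experimental/CCryptoBoringSSL_dilithium.h"

#include <stdio.h>

int main(void) {
  uint8_t encoded_pub[DILITHIUM_PUBLIC_KEY_BYTES];
  struct DILITHIUM_private_key priv;
  if (!DILITHIUM_generate_key(encoded_pub, &priv)) {
    return 1;
  }

  const uint8_t msg[] = {'h', 'i'};
  uint8_t sig[DILITHIUM_SIGNATURE_BYTES];
  if (!DILITHIUM_sign(sig, &priv, msg, sizeof(msg))) {
    return 1;
  }

  /* DILITHIUM_verify takes the structured public key, so derive it from the
   * private key rather than re-parsing the encoded bytes. */
  struct DILITHIUM_public_key pub;
  if (!DILITHIUM_public_from_private(&pub, &priv)) {
    return 1;
  }
  printf("verify: %d\n", DILITHIUM_verify(&pub, sig, msg, sizeof(msg)));
  return 0;
}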
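The SPHINCS+ changes in CCryptoBoringSSL_spx.h amount to a rename of the public entry points from spx_* to SPX_* plus the same OPENSSL_UNSTABLE_EXPERIMENTAL_SPX guard that the Kyber header gained. A corresponding keygen/sign/verify sketch against the renamed declarations, under the same include-path and flag assumptions as above:

/* Sketch only: exercises the renamed SPX_* declarations shown in the header
 * above. Assumes -DOPENSSL_UNSTABLE_EXPERIMENTAL_SPX. */
#define OPENSSL_UNSTABLE_EXPERIMENTAL_SPX
#include "experimental/CCryptoBoringSSL_spx.h"

#include <stdio.h>

int main(void) {
  uint8_t pk[SPX_PUBLIC_KEY_BYTES];
  uint8_t sk[SPX_SECRET_KEY_BYTES];
  SPX_generate_key(pk, sk);

  const uint8_t msg[] = {1, 2, 3};
  uint8_t sig[SPX_SIGNATURE_BYTES];
  SPX_sign(sig, sk, msg, sizeof(msg), /*randomized=*/1);

  /* SPX_verify returns 1 when the signature matches, 0 otherwise. */
  printf("verify: %d\n", SPX_verify(sig, pk, msg, sizeof(msg)));
  return 0;
}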