diff --git a/Package.swift b/Package.swift index 1a0bcb67..11134382 100644 --- a/Package.swift +++ b/Package.swift @@ -20,7 +20,7 @@ // Sources/CCryptoBoringSSL directory. The source repository is at // https://boringssl.googlesource.com/boringssl. // -// BoringSSL Commit: dbad745811195c00b729efd0ee0a09b7d9fce1d2 +// BoringSSL Commit: 6a2ccdcc2ed1d37a43a2183658d2ae61fd5ce208 import PackageDescription diff --git a/Sources/CCryptoBoringSSL/CMakeLists.txt b/Sources/CCryptoBoringSSL/CMakeLists.txt index cfbb1ef3..8a7ff4c8 100644 --- a/Sources/CCryptoBoringSSL/CMakeLists.txt +++ b/Sources/CCryptoBoringSSL/CMakeLists.txt @@ -91,6 +91,7 @@ add_library(CCryptoBoringSSL STATIC "crypto/dh_extra/dh_asn1.c" "crypto/dh_extra/params.c" "crypto/digest_extra/digest_extra.c" + "crypto/dilithium/dilithium.c" "crypto/dsa/dsa.c" "crypto/dsa/dsa_asn1.c" "crypto/ec_extra/ec_asn1.c" @@ -100,10 +101,11 @@ add_library(CCryptoBoringSSL STATIC "crypto/ecdsa_extra/ecdsa_asn1.c" "crypto/engine/engine.c" "crypto/err/err.c" - "crypto/err/err_data.c" "crypto/evp/evp.c" "crypto/evp/evp_asn1.c" "crypto/evp/evp_ctx.c" + "crypto/evp/p_dh.c" + "crypto/evp/p_dh_asn1.c" "crypto/evp/p_dsa_asn1.c" "crypto/evp/p_ec.c" "crypto/evp/p_ec_asn1.c" @@ -119,89 +121,17 @@ add_library(CCryptoBoringSSL STATIC "crypto/evp/scrypt.c" "crypto/evp/sign.c" "crypto/ex_data.c" - "crypto/fipsmodule/aes/aes.c" - "crypto/fipsmodule/aes/aes_nohw.c" - "crypto/fipsmodule/aes/key_wrap.c" - "crypto/fipsmodule/aes/mode_wrappers.c" - "crypto/fipsmodule/bn/add.c" - "crypto/fipsmodule/bn/asm/x86_64-gcc.c" - "crypto/fipsmodule/bn/bn.c" - "crypto/fipsmodule/bn/bytes.c" - "crypto/fipsmodule/bn/cmp.c" - "crypto/fipsmodule/bn/ctx.c" - "crypto/fipsmodule/bn/div.c" - "crypto/fipsmodule/bn/div_extra.c" - "crypto/fipsmodule/bn/exponentiation.c" - "crypto/fipsmodule/bn/gcd.c" - "crypto/fipsmodule/bn/gcd_extra.c" - "crypto/fipsmodule/bn/generic.c" - "crypto/fipsmodule/bn/jacobi.c" - "crypto/fipsmodule/bn/montgomery.c" - "crypto/fipsmodule/bn/montgomery_inv.c" - "crypto/fipsmodule/bn/mul.c" - "crypto/fipsmodule/bn/prime.c" - "crypto/fipsmodule/bn/random.c" - "crypto/fipsmodule/bn/rsaz_exp.c" - "crypto/fipsmodule/bn/shift.c" - "crypto/fipsmodule/bn/sqrt.c" - "crypto/fipsmodule/cipher/aead.c" - "crypto/fipsmodule/cipher/cipher.c" - "crypto/fipsmodule/cipher/e_aes.c" - "crypto/fipsmodule/cipher/e_aesccm.c" - "crypto/fipsmodule/cmac/cmac.c" - "crypto/fipsmodule/dh/check.c" - "crypto/fipsmodule/dh/dh.c" - "crypto/fipsmodule/digest/digest.c" - "crypto/fipsmodule/digest/digests.c" - "crypto/fipsmodule/digestsign/digestsign.c" - "crypto/fipsmodule/ec/ec.c" - "crypto/fipsmodule/ec/ec_key.c" - "crypto/fipsmodule/ec/ec_montgomery.c" - "crypto/fipsmodule/ec/felem.c" - "crypto/fipsmodule/ec/oct.c" - "crypto/fipsmodule/ec/p224-64.c" - "crypto/fipsmodule/ec/p256-nistz.c" - "crypto/fipsmodule/ec/p256.c" - "crypto/fipsmodule/ec/scalar.c" - "crypto/fipsmodule/ec/simple.c" - "crypto/fipsmodule/ec/simple_mul.c" - "crypto/fipsmodule/ec/util.c" - "crypto/fipsmodule/ec/wnaf.c" - "crypto/fipsmodule/ecdh/ecdh.c" - "crypto/fipsmodule/ecdsa/ecdsa.c" + "crypto/fipsmodule/bcm.c" "crypto/fipsmodule/fips_shared_support.c" - "crypto/fipsmodule/hkdf/hkdf.c" - "crypto/fipsmodule/hmac/hmac.c" - "crypto/fipsmodule/md4/md4.c" - "crypto/fipsmodule/md5/md5.c" - "crypto/fipsmodule/modes/cbc.c" - "crypto/fipsmodule/modes/cfb.c" - "crypto/fipsmodule/modes/ctr.c" - "crypto/fipsmodule/modes/gcm.c" - "crypto/fipsmodule/modes/gcm_nohw.c" - "crypto/fipsmodule/modes/ofb.c" - "crypto/fipsmodule/modes/polyval.c" - 
"crypto/fipsmodule/rand/ctrdrbg.c" - "crypto/fipsmodule/rand/fork_detect.c" - "crypto/fipsmodule/rand/rand.c" - "crypto/fipsmodule/rand/urandom.c" - "crypto/fipsmodule/rsa/blinding.c" - "crypto/fipsmodule/rsa/padding.c" - "crypto/fipsmodule/rsa/rsa.c" - "crypto/fipsmodule/rsa/rsa_impl.c" - "crypto/fipsmodule/self_check/fips.c" - "crypto/fipsmodule/self_check/self_check.c" - "crypto/fipsmodule/service_indicator/service_indicator.c" - "crypto/fipsmodule/sha/sha1.c" - "crypto/fipsmodule/sha/sha256.c" - "crypto/fipsmodule/sha/sha512.c" - "crypto/fipsmodule/tls/kdf.c" "crypto/hpke/hpke.c" "crypto/hrss/hrss.c" "crypto/keccak/keccak.c" "crypto/kyber/kyber.c" "crypto/lhash/lhash.c" + "crypto/md4/md4.c" + "crypto/md5/md5.c" "crypto/mem.c" + "crypto/mldsa/mldsa.c" "crypto/obj/obj.c" "crypto/obj/obj_xref.c" "crypto/pem/pem_all.c" @@ -222,26 +152,29 @@ add_library(CCryptoBoringSSL STATIC "crypto/poly1305/poly1305_vec.c" "crypto/pool/pool.c" "crypto/rand_extra/deterministic.c" + "crypto/rand_extra/fork_detect.c" "crypto/rand_extra/forkunsafe.c" "crypto/rand_extra/getentropy.c" "crypto/rand_extra/ios.c" "crypto/rand_extra/passive.c" "crypto/rand_extra/rand_extra.c" "crypto/rand_extra/trusty.c" + "crypto/rand_extra/urandom.c" "crypto/rand_extra/windows.c" "crypto/rc4/rc4.c" "crypto/refcount.c" "crypto/rsa_extra/rsa_asn1.c" "crypto/rsa_extra/rsa_crypt.c" "crypto/rsa_extra/rsa_print.c" + "crypto/sha/sha1.c" "crypto/siphash/siphash.c" - "crypto/spx/address.c" - "crypto/spx/fors.c" - "crypto/spx/merkle.c" "crypto/spx/spx.c" + "crypto/spx/spx_address.c" + "crypto/spx/spx_fors.c" + "crypto/spx/spx_merkle.c" + "crypto/spx/spx_thash.c" "crypto/spx/spx_util.c" - "crypto/spx/thash.c" - "crypto/spx/wots.c" + "crypto/spx/spx_wots.c" "crypto/stack/stack.c" "crypto/thread.c" "crypto/thread_none.c" @@ -319,80 +252,21 @@ add_library(CCryptoBoringSSL STATIC "crypto/x509/x_spki.c" "crypto/x509/x_val.c" "crypto/x509/x_x509.c" - "crypto/x509/x_x509a.c") + "crypto/x509/x_x509a.c" + "gen/crypto/err_data.c") if(CMAKE_SYSTEM_NAME STREQUAL Darwin AND CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64") target_sources(CCryptoBoringSSL PRIVATE - crypto/chacha/chacha-x86_64-mac.mac.x86_64.S - crypto/cipher_extra/aes128gcmsiv-x86_64-mac.mac.x86_64.S - crypto/cipher_extra/chacha20_poly1305_x86_64-mac.mac.x86_64.S - crypto/fipsmodule/aesni-gcm-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/aesni-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/ghash-ssse3-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/ghash-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/md5-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/p256-x86_64-asm-mac.mac.x86_64.S - crypto/fipsmodule/p256_beeu-x86_64-asm-mac.mac.x86_64.S - crypto/fipsmodule/rdrand-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/rsaz-avx2-mac.mac.x86_64.S - crypto/fipsmodule/sha1-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/sha256-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/sha512-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/vpaes-x86_64-mac.mac.x86_64.S - crypto/fipsmodule/x86_64-mont-mac.mac.x86_64.S - crypto/fipsmodule/x86_64-mont5-mac.mac.x86_64.S) +) elseif(CMAKE_SYSTEM_NAME MATCHES "Linux|Android" AND CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64") target_sources(CCryptoBoringSSL PRIVATE - crypto/chacha/chacha-x86_64-linux.linux.x86_64.S - crypto/cipher_extra/aes128gcmsiv-x86_64-linux.linux.x86_64.S - crypto/cipher_extra/chacha20_poly1305_x86_64-linux.linux.x86_64.S - crypto/fipsmodule/aesni-gcm-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/aesni-x86_64-linux.linux.x86_64.S - 
crypto/fipsmodule/ghash-ssse3-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/ghash-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/md5-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/p256-x86_64-asm-linux.linux.x86_64.S - crypto/fipsmodule/p256_beeu-x86_64-asm-linux.linux.x86_64.S - crypto/fipsmodule/rdrand-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/rsaz-avx2-linux.linux.x86_64.S - crypto/fipsmodule/sha1-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/sha256-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/sha512-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/vpaes-x86_64-linux.linux.x86_64.S - crypto/fipsmodule/x86_64-mont-linux.linux.x86_64.S - crypto/fipsmodule/x86_64-mont5-linux.linux.x86_64.S) +) elseif(CMAKE_SYSTEM_NAME STREQUAL Darwin AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") target_sources(CCryptoBoringSSL PRIVATE - crypto/chacha/chacha-armv8-ios.ios.aarch64.S - crypto/cipher_extra/chacha20_poly1305_armv8-ios.ios.aarch64.S - crypto/fipsmodule/aesv8-armv8-ios.ios.aarch64.S - crypto/fipsmodule/aesv8-gcm-armv8-ios.ios.aarch64.S - crypto/fipsmodule/armv8-mont-ios.ios.aarch64.S - crypto/fipsmodule/bn-armv8-ios.ios.aarch64.S - crypto/fipsmodule/ghash-neon-armv8-ios.ios.aarch64.S - crypto/fipsmodule/ghashv8-armv8-ios.ios.aarch64.S - crypto/fipsmodule/p256-armv8-asm-ios.ios.aarch64.S - crypto/fipsmodule/p256_beeu-armv8-asm-ios.ios.aarch64.S - crypto/fipsmodule/sha1-armv8-ios.ios.aarch64.S - crypto/fipsmodule/sha256-armv8-ios.ios.aarch64.S - crypto/fipsmodule/sha512-armv8-ios.ios.aarch64.S - crypto/fipsmodule/vpaes-armv8-ios.ios.aarch64.S) +) elseif(CMAKE_SYSTEM_NAME MATCHES "Linux|Android" AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") target_sources(CCryptoBoringSSL PRIVATE - crypto/chacha/chacha-armv8-linux.linux.aarch64.S - crypto/cipher_extra/chacha20_poly1305_armv8-linux.linux.aarch64.S - crypto/fipsmodule/aesv8-armv8-linux.linux.aarch64.S - crypto/fipsmodule/aesv8-gcm-armv8-linux.linux.aarch64.S - crypto/fipsmodule/armv8-mont-linux.linux.aarch64.S - crypto/fipsmodule/bn-armv8-linux.linux.aarch64.S - crypto/fipsmodule/ghash-neon-armv8-linux.linux.aarch64.S - crypto/fipsmodule/ghashv8-armv8-linux.linux.aarch64.S - crypto/fipsmodule/p256-armv8-asm-linux.linux.aarch64.S - crypto/fipsmodule/p256_beeu-armv8-asm-linux.linux.aarch64.S - crypto/fipsmodule/sha1-armv8-linux.linux.aarch64.S - crypto/fipsmodule/sha256-armv8-linux.linux.aarch64.S - crypto/fipsmodule/sha512-armv8-linux.linux.aarch64.S - crypto/fipsmodule/vpaes-armv8-linux.linux.aarch64.S) +) endif() target_include_directories(CCryptoBoringSSL PUBLIC diff --git a/Sources/CCryptoBoringSSL/crypto/base64/base64.c b/Sources/CCryptoBoringSSL/crypto/base64/base64.c index b934e919..34e23e6b 100644 --- a/Sources/CCryptoBoringSSL/crypto/base64/base64.c +++ b/Sources/CCryptoBoringSSL/crypto/base64/base64.c @@ -307,6 +307,10 @@ static int base64_decode_quad(uint8_t *out, size_t *out_num_bytes, (in[2] == '=') << 1 | (in[3] == '='); + // In presence of padding, the lowest bits of v are unused. Canonical encoding + // (RFC 4648, section 3.5) requires that these bits all be set to zero. Common + // PEM parsers accept noncanonical base64, adding to the malleability of the + // format. This decoder follows OpenSSL's and Go's PEM parsers and accepts it. switch (padding_pattern) { case 0: // The common case of no padding. 
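For context on the canonical-padding comment in the base64.c hunk above: when a quad ends in one or two '=' characters, the final 6-bit symbol contributes bits that fall past the end of the decoded output. RFC 4648 section 3.5 requires those spare bits to be zero, but a decoder that simply discards them will accept several distinct encodings of the same bytes. The following is a standalone sketch in plain C, not part of this diff and not the BoringSSL API, showing two quads that decode to the same byte while only one is canonical:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative only: decode one base64 quad that carries two '=' padding
 * characters, and report whether the spare low bits are zero, i.e. whether
 * the encoding is canonical per RFC 4648 section 3.5. */
static int decode_quad_2pad(const char quad[4], uint8_t *out_byte,
                            int *out_canonical) {
  static const char kAlphabet[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  const char *p0 = strchr(kAlphabet, quad[0]);
  const char *p1 = strchr(kAlphabet, quad[1]);
  if (p0 == NULL || p1 == NULL || quad[2] != '=' || quad[3] != '=') {
    return 0;
  }
  uint32_t v = (uint32_t)(p0 - kAlphabet) << 6 | (uint32_t)(p1 - kAlphabet);
  *out_byte = (uint8_t)(v >> 4);    /* the single decoded byte */
  *out_canonical = (v & 0xf) == 0;  /* spare low bits must be zero */
  return 1;
}

int main(void) {
  uint8_t b;
  int canon;
  decode_quad_2pad("Zg==", &b, &canon);
  printf("Zg== -> 0x%02x canonical=%d\n", b, canon);  /* 0x66 ('f'), 1 */
  decode_quad_2pad("Zh==", &b, &canon);
  printf("Zh== -> 0x%02x canonical=%d\n", b, canon);  /* 0x66 ('f'), 0 */
  return 0;
}

A lenient decoder (as the comment notes, matching OpenSSL's and Go's PEM parsers) treats both "Zg==" and "Zh==" as the byte 'f'; only the first is the canonical encoding.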
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.h b/Sources/CCryptoBoringSSL/crypto/bcm_support.h similarity index 55% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.h rename to Sources/CCryptoBoringSSL/crypto/bcm_support.h index 0b7112a0..9556c17f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.h +++ b/Sources/CCryptoBoringSSL/crypto/bcm_support.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020, Google Inc. +/* Copyright (c) 2024, Google Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -12,11 +12,17 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#ifndef OPENSSL_HEADER_CRYPTO_FORK_DETECT_H -#define OPENSSL_HEADER_CRYPTO_FORK_DETECT_H +#ifndef OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H +#define OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H #include +// Provided by libcrypto, called from BCM + +#if defined(__cplusplus) +extern "C" { +#endif + #if defined(OPENSSL_LINUX) // On linux we use MADVISE instead of pthread_atfork(), due // to concerns about clone() being used for address space @@ -29,15 +35,54 @@ // iOS doesn't normally allow fork in apps, but it's there. #define OPENSSL_FORK_DETECTION #define OPENSSL_FORK_DETECTION_PTHREAD_ATFORK -#elif defined(OPENSSL_WINDOWS) || defined(OPENSSL_TRUSTY) +#elif defined(OPENSSL_WINDOWS) || defined(OPENSSL_TRUSTY) || \ + defined(__ZEPHYR__) || defined(CROS_EC) // These platforms do not fork. #define OPENSSL_DOES_NOT_FORK #endif -#if defined(__cplusplus) -extern "C" { +#if defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) +#define OPENSSL_RAND_DETERMINISTIC +#elif defined(OPENSSL_TRUSTY) +#define OPENSSL_RAND_TRUSTY +#elif defined(OPENSSL_WINDOWS) +#define OPENSSL_RAND_WINDOWS +#elif defined(OPENSSL_LINUX) +#define OPENSSL_RAND_URANDOM +#elif defined(OPENSSL_APPLE) && !defined(OPENSSL_MACOS) +// Unlike macOS, iOS and similar hide away getentropy(). +#define OPENSSL_RAND_IOS +#else +// By default if you are integrating BoringSSL we expect you to +// provide getentropy from the header file. +#define OPENSSL_RAND_GETENTROPY #endif +// Provided by libcrypto, called from BCM + +// CRYPTO_init_sysrand initializes long-lived resources needed to draw entropy +// from the operating system, if the operating system requires initialization. +void CRYPTO_init_sysrand(void); + +// CRYPTO_sysrand fills |len| bytes at |buf| with entropy from the operating +// system. +void CRYPTO_sysrand(uint8_t *buf, size_t len); + +// CRYPTO_sysrand_if_available fills |len| bytes at |buf| with entropy from the +// operating system, or early /dev/urandom data, and returns 1, _if_ the entropy +// pool is initialized or if getrandom() is not available and not in FIPS mode. +// Otherwise it will not block and will instead fill |buf| with all zeros and +// return 0. +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len); + +// CRYPTO_sysrand_for_seed fills |len| bytes at |buf| with entropy from the +// operating system. It may draw from the |GRND_RANDOM| pool on Android, +// depending on the vendor's configuration. +void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len); + +// RAND_need_entropy is called whenever the BCM module has stopped because it +// has run out of entropy. 
+void RAND_need_entropy(size_t bytes_needed); // crypto_get_fork_generation returns the fork generation number for the current // process, or zero if not supported on the platform. The fork generation number @@ -60,8 +105,9 @@ OPENSSL_EXPORT uint64_t CRYPTO_get_fork_generation(void); OPENSSL_EXPORT void CRYPTO_fork_detect_force_madv_wipeonfork_for_testing( int on); + #if defined(__cplusplus) } // extern C #endif -#endif // OPENSSL_HEADER_CRYPTO_FORK_DETECT_H +#endif // OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H diff --git a/Sources/CCryptoBoringSSL/crypto/bio/bio.c b/Sources/CCryptoBoringSSL/crypto/bio/bio.c index 77181653..0de106b9 100644 --- a/Sources/CCryptoBoringSSL/crypto/bio/bio.c +++ b/Sources/CCryptoBoringSSL/crypto/bio/bio.c @@ -658,38 +658,38 @@ void BIO_meth_free(BIO_METHOD *method) { } int BIO_meth_set_create(BIO_METHOD *method, - int (*create)(BIO *)) { - method->create = create; + int (*create_func)(BIO *)) { + method->create = create_func; return 1; } int BIO_meth_set_destroy(BIO_METHOD *method, - int (*destroy)(BIO *)) { - method->destroy = destroy; + int (*destroy_func)(BIO *)) { + method->destroy = destroy_func; return 1; } int BIO_meth_set_write(BIO_METHOD *method, - int (*write)(BIO *, const char *, int)) { - method->bwrite = write; + int (*write_func)(BIO *, const char *, int)) { + method->bwrite = write_func; return 1; } int BIO_meth_set_read(BIO_METHOD *method, - int (*read)(BIO *, char *, int)) { - method->bread = read; + int (*read_func)(BIO *, char *, int)) { + method->bread = read_func; return 1; } int BIO_meth_set_gets(BIO_METHOD *method, - int (*gets)(BIO *, char *, int)) { - method->bgets = gets; + int (*gets_func)(BIO *, char *, int)) { + method->bgets = gets_func; return 1; } int BIO_meth_set_ctrl(BIO_METHOD *method, - long (*ctrl)(BIO *, int, long, void *)) { - method->ctrl = ctrl; + long (*ctrl_func)(BIO *, int, long, void *)) { + method->ctrl = ctrl_func; return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/bytestring/cbs.c b/Sources/CCryptoBoringSSL/crypto/bytestring/cbs.c index ee5a41b2..625c6a06 100644 --- a/Sources/CCryptoBoringSSL/crypto/bytestring/cbs.c +++ b/Sources/CCryptoBoringSSL/crypto/bytestring/cbs.c @@ -507,11 +507,9 @@ int CBS_get_asn1_int64(CBS *cbs, int64_t *out) { return 0; } uint8_t sign_extend[sizeof(int64_t)]; - memset(sign_extend, is_negative ? 0xff : 0, sizeof(sign_extend)); - for (size_t i = 0; i < len; i++) { - sign_extend[i] = data[len - i - 1]; - } - memcpy(out, sign_extend, sizeof(sign_extend)); + OPENSSL_memset(sign_extend, is_negative ? 0xff : 0, sizeof(sign_extend)); + OPENSSL_memcpy(sign_extend + sizeof(int64_t) - len, data, len); + *out = CRYPTO_load_u64_be(sign_extend); return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/bytestring/unicode.c b/Sources/CCryptoBoringSSL/crypto/bytestring/unicode.c index 00e603c1..4f990aef 100644 --- a/Sources/CCryptoBoringSSL/crypto/bytestring/unicode.c +++ b/Sources/CCryptoBoringSSL/crypto/bytestring/unicode.c @@ -18,11 +18,12 @@ static int is_valid_code_point(uint32_t v) { - // References in the following are to Unicode 9.0.0. + // References in the following are to Unicode 15.0.0. if (// The Unicode space runs from zero to 0x10ffff (3.4 D9). v > 0x10ffff || // Values 0x...fffe, 0x...ffff, and 0xfdd0-0xfdef are permanently reserved - // (3.4 D14) + // as noncharacters (3.4 D14). See also 23.7. As our APIs are intended for + // "open interchange", such as ASN.1, we reject them. (v & 0xfffe) == 0xfffe || (v >= 0xfdd0 && v <= 0xfdef) || // Surrogate code points are invalid (3.2 C1). 
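The CBS_get_asn1_int64 hunk above replaces the byte-reversal loop with a single copy into the high end of an 8-byte buffer followed by a big-endian load, so negative two's-complement INTEGERs are sign-extended in one step without assembling a host-endian representation byte by byte. Below is a standalone sketch of the same idea in plain C; it uses memset/memcpy and a hand-rolled big-endian load as stand-ins for BoringSSL's OPENSSL_memset/OPENSSL_memcpy and CRYPTO_load_u64_be, and is not part of this diff:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Load 8 bytes as a big-endian uint64_t (stand-in for CRYPTO_load_u64_be). */
static uint64_t load_u64_be(const uint8_t in[8]) {
  uint64_t v = 0;
  for (size_t i = 0; i < 8; i++) {
    v = (v << 8) | in[i];
  }
  return v;
}

/* Sign-extend a big-endian two's-complement integer of 1..8 bytes into an
 * int64_t, mirroring the new CBS_get_asn1_int64 approach: pre-fill with 0x00
 * or 0xff depending on the sign bit, copy the bytes into the low-order
 * (rightmost) end, then perform one big-endian load. Assumes 1 <= len <= 8,
 * which the real code checks before reaching this point. */
static int64_t sign_extend_be(const uint8_t *data, size_t len) {
  int is_negative = (data[0] & 0x80) != 0;
  uint8_t buf[sizeof(int64_t)];
  memset(buf, is_negative ? 0xff : 0x00, sizeof(buf));
  memcpy(buf + sizeof(buf) - len, data, len);
  uint64_t u = load_u64_be(buf);
  int64_t out;
  memcpy(&out, &u, sizeof(out));  /* reinterpret the two's-complement bits */
  return out;
}

int main(void) {
  const uint8_t minus_one[] = {0xff};        /* DER INTEGER -1   */
  const uint8_t minus_256[] = {0xff, 0x00};  /* DER INTEGER -256 */
  const uint8_t plus_300[] = {0x01, 0x2c};   /* DER INTEGER 300  */
  printf("%lld %lld %lld\n", (long long)sign_extend_be(minus_one, 1),
         (long long)sign_extend_be(minus_256, 2),
         (long long)sign_extend_be(plus_300, 2));  /* prints: -1 -256 300 */
  return 0;
}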
diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-ios.ios.arm.S deleted file mode 100644 index 39e6890e..00000000 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-ios.ios.arm.S +++ /dev/null @@ -1,1462 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. - - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -#if defined(__thumb2__) || defined(__clang__) -#define ldrhsb ldrbhs -#endif - -.align 5 -Lsigma: -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral -Lone: -.long 1,0,0,0 - -.globl _ChaCha20_ctr32_nohw -.private_extern _ChaCha20_ctr32_nohw -#ifdef __thumb2__ -.thumb_func _ChaCha20_ctr32_nohw -#endif -.align 5 -_ChaCha20_ctr32_nohw: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0,r1,r2,r4-r11,lr} - adr r14,Lsigma - ldmia r12,{r4,r5,r6,r7} @ load counter and nonce - sub sp,sp,#4*(16) @ off-load area - stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce - ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key - ldmia r14,{r0,r1,r2,r3} @ load sigma - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key - stmdb sp!,{r0,r1,r2,r3} @ copy sigma - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - b Loop_outer_enter - -.align 4 -Loop_outer: - ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material - str r11,[sp,#4*(32+2)] @ save len - str r12, [sp,#4*(32+1)] @ save inp - str r14, [sp,#4*(32+0)] @ save out -Loop_outer_enter: - ldr r11, [sp,#4*(15)] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - ldr r10, [sp,#4*(13)] - ldr r14,[sp,#4*(14)] - str r11, [sp,#4*(16+15)] - mov r11,#10 - b Loop - -.align 4 -Loop: - subs r11,r11,#1 - add r0,r0,r4 - mov r12,r12,ror#16 - add r1,r1,r5 - mov r10,r10,ror#16 - eor r12,r12,r0,ror#16 - eor r10,r10,r1,ror#16 - add r8,r8,r12 - mov r4,r4,ror#20 - add r9,r9,r10 - mov r5,r5,ror#20 - eor r4,r4,r8,ror#20 - eor r5,r5,r9,ror#20 - add r0,r0,r4 - mov r12,r12,ror#24 - add r1,r1,r5 - mov r10,r10,ror#24 - eor r12,r12,r0,ror#24 - eor r10,r10,r1,ror#24 - add r8,r8,r12 - mov r4,r4,ror#25 - add r9,r9,r10 - mov r5,r5,ror#25 - str r10,[sp,#4*(16+13)] - ldr r10,[sp,#4*(16+15)] - eor r4,r4,r8,ror#25 - eor r5,r5,r9,ror#25 - str r8,[sp,#4*(16+8)] - ldr r8,[sp,#4*(16+10)] - add r2,r2,r6 - mov r14,r14,ror#16 - str r9,[sp,#4*(16+9)] - ldr r9,[sp,#4*(16+11)] - add r3,r3,r7 - mov r10,r10,ror#16 - eor r14,r14,r2,ror#16 - eor r10,r10,r3,ror#16 - add r8,r8,r14 - mov r6,r6,ror#20 - add r9,r9,r10 - mov r7,r7,ror#20 - eor r6,r6,r8,ror#20 - eor r7,r7,r9,ror#20 - add r2,r2,r6 - mov r14,r14,ror#24 - add r3,r3,r7 - mov r10,r10,ror#24 - eor r14,r14,r2,ror#24 - eor r10,r10,r3,ror#24 - add r8,r8,r14 - mov r6,r6,ror#25 - add r9,r9,r10 - mov r7,r7,ror#25 - eor r6,r6,r8,ror#25 - eor r7,r7,r9,ror#25 - add r0,r0,r5 - mov r10,r10,ror#16 - add r1,r1,r6 - mov r12,r12,ror#16 - eor r10,r10,r0,ror#16 - eor r12,r12,r1,ror#16 - add r8,r8,r10 - mov r5,r5,ror#20 - add r9,r9,r12 - mov r6,r6,ror#20 - eor r5,r5,r8,ror#20 - eor r6,r6,r9,ror#20 - add r0,r0,r5 - mov r10,r10,ror#24 - 
add r1,r1,r6 - mov r12,r12,ror#24 - eor r10,r10,r0,ror#24 - eor r12,r12,r1,ror#24 - add r8,r8,r10 - mov r5,r5,ror#25 - str r10,[sp,#4*(16+15)] - ldr r10,[sp,#4*(16+13)] - add r9,r9,r12 - mov r6,r6,ror#25 - eor r5,r5,r8,ror#25 - eor r6,r6,r9,ror#25 - str r8,[sp,#4*(16+10)] - ldr r8,[sp,#4*(16+8)] - add r2,r2,r7 - mov r10,r10,ror#16 - str r9,[sp,#4*(16+11)] - ldr r9,[sp,#4*(16+9)] - add r3,r3,r4 - mov r14,r14,ror#16 - eor r10,r10,r2,ror#16 - eor r14,r14,r3,ror#16 - add r8,r8,r10 - mov r7,r7,ror#20 - add r9,r9,r14 - mov r4,r4,ror#20 - eor r7,r7,r8,ror#20 - eor r4,r4,r9,ror#20 - add r2,r2,r7 - mov r10,r10,ror#24 - add r3,r3,r4 - mov r14,r14,ror#24 - eor r10,r10,r2,ror#24 - eor r14,r14,r3,ror#24 - add r8,r8,r10 - mov r7,r7,ror#25 - add r9,r9,r14 - mov r4,r4,ror#25 - eor r7,r7,r8,ror#25 - eor r4,r4,r9,ror#25 - bne Loop - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - cmp r11,#64 @ done yet? -#ifdef __thumb2__ - itete lo -#endif - addlo r12,sp,#4*(0) @ shortcut or ... - ldrhs r12,[sp,#4*(32+1)] @ ... load inp - addlo r14,sp,#4*(0) @ shortcut or ... - ldrhs r14,[sp,#4*(32+0)] @ ... load out - - ldr r8,[sp,#4*(0)] @ load key material - ldr r9,[sp,#4*(1)] - -#if __ARM_ARCH>=6 || !defined(__ARMEB__) -# if __ARM_ARCH<7 - orr r10,r12,r14 - tst r10,#3 @ are input and output aligned? - ldr r10,[sp,#4*(2)] - bne Lunaligned - cmp r11,#64 @ restore flags -# else - ldr r10,[sp,#4*(2)] -# endif - ldr r11,[sp,#4*(3)] - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - - add r2,r2,r10 - add r3,r3,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH>=6 && defined(__ARMEB__) - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r0,r0,r8 @ xor with input - eorhs r1,r1,r9 - add r8,sp,#4*(4) - str r0,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r1,[r14,#-12] - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r6,r10 - add r7,r7,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH>=6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 - add r8,sp,#4*(8) - str r4,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r5,[r14,#-12] - ldmia r8,{r8,r9,r10,r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] -# ifdef __thumb2__ - itt hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r2,r2,r10 - add r3,r3,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH>=6 && defined(__ARMEB__) 
- rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r0,r0,r8 - eorhs r1,r1,r9 - add r8,sp,#4*(12) - str r0,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - str r1,[r14,#-12] - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 -# ifdef __thumb2__ - itt hi -# endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r6,r10 - add r7,r7,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH>=6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 -# ifdef __thumb2__ - it ne -# endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -# ifdef __thumb2__ - itt hs -# endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r4,[r14],#16 @ store output - str r5,[r14,#-12] -# ifdef __thumb2__ - it hs -# endif - subhs r11,r8,#64 @ len-=64 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi Loop_outer - - beq Ldone -# if __ARM_ARCH<7 - b Ltail - -.align 4 -Lunaligned:@ unaligned endian-neutral path - cmp r11,#64 @ restore flags -# endif -#endif -#if __ARM_ARCH<7 - ldr r11,[sp,#4*(3)] - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 - add r2,r2,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r3,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+0) - ldmia r8,{r8,r9,r10,r11} @ load key material - add r0,sp,#4*(16+8) - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 - add r6,r6,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r7,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] - add r8,sp,#4*(4+4) - ldmia r8,{r8,r9,r10,r11} @ load key material - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half -# ifdef __thumb2__ - itt hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" - strhi r11,[sp,#4*(16+11)] @ copy "rx" - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 - add r2,r2,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r3,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+8) - ldmia r8,{r8,r9,r10,r11} @ load key material - add r4,r4,r8 @ accumulate key material -# ifdef __thumb2__ - itt hi -# endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value - add r5,r5,r9 - add r6,r6,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r7,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] -# ifdef __thumb2__ - it ne -# endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -# ifdef __thumb2__ - it hs -# endif - subhs r11,r8,#64 @ len-=64 - bhi Loop_outer - - beq Ldone -#endif - -Ltail: - ldr r12,[sp,#4*(32+1)] @ load inp - add r9,sp,#4*(0) - ldr r14,[sp,#4*(32+0)] @ load out - -Loop_tail: - ldrb r10,[r9],#1 @ read buffer on stack - ldrb r11,[r12],#1 @ read input - subs r8,r8,#1 - eor r11,r11,r10 - strb r11,[r14],#1 @ store output - bne Loop_tail - -Ldone: - add sp,sp,#4*(32+3) - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} - -#if __ARM_MAX_ARCH__>=7 - - - -.globl _ChaCha20_ctr32_neon -.private_extern _ChaCha20_ctr32_neon -#ifdef __thumb2__ -.thumb_func _ChaCha20_ctr32_neon -#endif -.align 5 -_ChaCha20_ctr32_neon: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0,r1,r2,r4-r11,lr} - adr r14,Lsigma - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so - stmdb sp!,{r0,r1,r2,r3} - - vld1.32 {q1,q2},[r3] @ load key - ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key - - sub sp,sp,#4*(16+16) - vld1.32 {q3},[r12] @ load counter and nonce - add r12,sp,#4*8 - ldmia r14,{r0,r1,r2,r3} @ load sigma - vld1.32 {q0},[r14]! 
@ load sigma - vld1.32 {q12},[r14] @ one - vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce - vst1.32 {q0,q1},[sp] @ copy sigma|1/2key - - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - vshl.i32 d26,d24,#1 @ two - vstr d24,[sp,#4*(16+0)] - vshl.i32 d28,d24,#2 @ four - vstr d26,[sp,#4*(16+2)] - vmov q4,q0 - vstr d28,[sp,#4*(16+4)] - vmov q8,q0 - vmov q5,q1 - vmov q9,q1 - b Loop_neon_enter - -.align 4 -Loop_neon_outer: - ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material - cmp r11,#64*2 @ if len<=64*2 - bls Lbreak_neon @ switch to integer-only - vmov q4,q0 - str r11,[sp,#4*(32+2)] @ save len - vmov q8,q0 - str r12, [sp,#4*(32+1)] @ save inp - vmov q5,q1 - str r14, [sp,#4*(32+0)] @ save out - vmov q9,q1 -Loop_neon_enter: - ldr r11, [sp,#4*(15)] - vadd.i32 q7,q3,q12 @ counter+1 - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - vmov q6,q2 - ldr r10, [sp,#4*(13)] - vmov q10,q2 - ldr r14,[sp,#4*(14)] - vadd.i32 q11,q7,q12 @ counter+2 - str r11, [sp,#4*(16+15)] - mov r11,#10 - add r12,r12,#3 @ counter+3 - b Loop_neon - -.align 4 -Loop_neon: - subs r11,r11,#1 - vadd.i32 q0,q0,q1 - add r0,r0,r4 - vadd.i32 q4,q4,q5 - mov r12,r12,ror#16 - vadd.i32 q8,q8,q9 - add r1,r1,r5 - veor q3,q3,q0 - mov r10,r10,ror#16 - veor q7,q7,q4 - eor r12,r12,r0,ror#16 - veor q11,q11,q8 - eor r10,r10,r1,ror#16 - vrev32.16 q3,q3 - add r8,r8,r12 - vrev32.16 q7,q7 - mov r4,r4,ror#20 - vrev32.16 q11,q11 - add r9,r9,r10 - vadd.i32 q2,q2,q3 - mov r5,r5,ror#20 - vadd.i32 q6,q6,q7 - eor r4,r4,r8,ror#20 - vadd.i32 q10,q10,q11 - eor r5,r5,r9,ror#20 - veor q12,q1,q2 - add r0,r0,r4 - veor q13,q5,q6 - mov r12,r12,ror#24 - veor q14,q9,q10 - add r1,r1,r5 - vshr.u32 q1,q12,#20 - mov r10,r10,ror#24 - vshr.u32 q5,q13,#20 - eor r12,r12,r0,ror#24 - vshr.u32 q9,q14,#20 - eor r10,r10,r1,ror#24 - vsli.32 q1,q12,#12 - add r8,r8,r12 - vsli.32 q5,q13,#12 - mov r4,r4,ror#25 - vsli.32 q9,q14,#12 - add r9,r9,r10 - vadd.i32 q0,q0,q1 - mov r5,r5,ror#25 - vadd.i32 q4,q4,q5 - str r10,[sp,#4*(16+13)] - vadd.i32 q8,q8,q9 - ldr r10,[sp,#4*(16+15)] - veor q12,q3,q0 - eor r4,r4,r8,ror#25 - veor q13,q7,q4 - eor r5,r5,r9,ror#25 - veor q14,q11,q8 - str r8,[sp,#4*(16+8)] - vshr.u32 q3,q12,#24 - ldr r8,[sp,#4*(16+10)] - vshr.u32 q7,q13,#24 - add r2,r2,r6 - vshr.u32 q11,q14,#24 - mov r14,r14,ror#16 - vsli.32 q3,q12,#8 - str r9,[sp,#4*(16+9)] - vsli.32 q7,q13,#8 - ldr r9,[sp,#4*(16+11)] - vsli.32 q11,q14,#8 - add r3,r3,r7 - vadd.i32 q2,q2,q3 - mov r10,r10,ror#16 - vadd.i32 q6,q6,q7 - eor r14,r14,r2,ror#16 - vadd.i32 q10,q10,q11 - eor r10,r10,r3,ror#16 - veor q12,q1,q2 - add r8,r8,r14 - veor q13,q5,q6 - mov r6,r6,ror#20 - veor q14,q9,q10 - add r9,r9,r10 - vshr.u32 q1,q12,#25 - mov r7,r7,ror#20 - vshr.u32 q5,q13,#25 - eor r6,r6,r8,ror#20 - vshr.u32 q9,q14,#25 - eor r7,r7,r9,ror#20 - vsli.32 q1,q12,#7 - add r2,r2,r6 - vsli.32 q5,q13,#7 - mov r14,r14,ror#24 - vsli.32 q9,q14,#7 - add r3,r3,r7 - vext.8 q2,q2,q2,#8 - mov r10,r10,ror#24 - vext.8 q6,q6,q6,#8 - eor r14,r14,r2,ror#24 - vext.8 q10,q10,q10,#8 - eor r10,r10,r3,ror#24 - vext.8 q1,q1,q1,#4 - add r8,r8,r14 - vext.8 q5,q5,q5,#4 - mov r6,r6,ror#25 - vext.8 q9,q9,q9,#4 - add r9,r9,r10 - vext.8 q3,q3,q3,#12 - mov r7,r7,ror#25 - vext.8 q7,q7,q7,#12 - eor r6,r6,r8,ror#25 - vext.8 q11,q11,q11,#12 - eor r7,r7,r9,ror#25 - vadd.i32 q0,q0,q1 - add r0,r0,r5 - vadd.i32 q4,q4,q5 - mov r10,r10,ror#16 - vadd.i32 q8,q8,q9 - add r1,r1,r6 - veor q3,q3,q0 - mov r12,r12,ror#16 - veor q7,q7,q4 - eor r10,r10,r0,ror#16 - veor q11,q11,q8 - eor r12,r12,r1,ror#16 - vrev32.16 q3,q3 - add r8,r8,r10 - 
vrev32.16 q7,q7 - mov r5,r5,ror#20 - vrev32.16 q11,q11 - add r9,r9,r12 - vadd.i32 q2,q2,q3 - mov r6,r6,ror#20 - vadd.i32 q6,q6,q7 - eor r5,r5,r8,ror#20 - vadd.i32 q10,q10,q11 - eor r6,r6,r9,ror#20 - veor q12,q1,q2 - add r0,r0,r5 - veor q13,q5,q6 - mov r10,r10,ror#24 - veor q14,q9,q10 - add r1,r1,r6 - vshr.u32 q1,q12,#20 - mov r12,r12,ror#24 - vshr.u32 q5,q13,#20 - eor r10,r10,r0,ror#24 - vshr.u32 q9,q14,#20 - eor r12,r12,r1,ror#24 - vsli.32 q1,q12,#12 - add r8,r8,r10 - vsli.32 q5,q13,#12 - mov r5,r5,ror#25 - vsli.32 q9,q14,#12 - str r10,[sp,#4*(16+15)] - vadd.i32 q0,q0,q1 - ldr r10,[sp,#4*(16+13)] - vadd.i32 q4,q4,q5 - add r9,r9,r12 - vadd.i32 q8,q8,q9 - mov r6,r6,ror#25 - veor q12,q3,q0 - eor r5,r5,r8,ror#25 - veor q13,q7,q4 - eor r6,r6,r9,ror#25 - veor q14,q11,q8 - str r8,[sp,#4*(16+10)] - vshr.u32 q3,q12,#24 - ldr r8,[sp,#4*(16+8)] - vshr.u32 q7,q13,#24 - add r2,r2,r7 - vshr.u32 q11,q14,#24 - mov r10,r10,ror#16 - vsli.32 q3,q12,#8 - str r9,[sp,#4*(16+11)] - vsli.32 q7,q13,#8 - ldr r9,[sp,#4*(16+9)] - vsli.32 q11,q14,#8 - add r3,r3,r4 - vadd.i32 q2,q2,q3 - mov r14,r14,ror#16 - vadd.i32 q6,q6,q7 - eor r10,r10,r2,ror#16 - vadd.i32 q10,q10,q11 - eor r14,r14,r3,ror#16 - veor q12,q1,q2 - add r8,r8,r10 - veor q13,q5,q6 - mov r7,r7,ror#20 - veor q14,q9,q10 - add r9,r9,r14 - vshr.u32 q1,q12,#25 - mov r4,r4,ror#20 - vshr.u32 q5,q13,#25 - eor r7,r7,r8,ror#20 - vshr.u32 q9,q14,#25 - eor r4,r4,r9,ror#20 - vsli.32 q1,q12,#7 - add r2,r2,r7 - vsli.32 q5,q13,#7 - mov r10,r10,ror#24 - vsli.32 q9,q14,#7 - add r3,r3,r4 - vext.8 q2,q2,q2,#8 - mov r14,r14,ror#24 - vext.8 q6,q6,q6,#8 - eor r10,r10,r2,ror#24 - vext.8 q10,q10,q10,#8 - eor r14,r14,r3,ror#24 - vext.8 q1,q1,q1,#12 - add r8,r8,r10 - vext.8 q5,q5,q5,#12 - mov r7,r7,ror#25 - vext.8 q9,q9,q9,#12 - add r9,r9,r14 - vext.8 q3,q3,q3,#4 - mov r4,r4,ror#25 - vext.8 q7,q7,q7,#4 - eor r7,r7,r8,ror#25 - vext.8 q11,q11,q11,#4 - eor r4,r4,r9,ror#25 - bne Loop_neon - - add r11,sp,#32 - vld1.32 {q12,q13},[sp] @ load key material - vld1.32 {q14,q15},[r11] - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - ldr r12,[sp,#4*(32+1)] @ load inp - ldr r14,[sp,#4*(32+0)] @ load out - - vadd.i32 q0,q0,q12 @ accumulate key material - vadd.i32 q4,q4,q12 - vadd.i32 q8,q8,q12 - vldr d24,[sp,#4*(16+0)] @ one - - vadd.i32 q1,q1,q13 - vadd.i32 q5,q5,q13 - vadd.i32 q9,q9,q13 - vldr d26,[sp,#4*(16+2)] @ two - - vadd.i32 q2,q2,q14 - vadd.i32 q6,q6,q14 - vadd.i32 q10,q10,q14 - vadd.i32 d14,d14,d24 @ counter+1 - vadd.i32 d22,d22,d26 @ counter+2 - - vadd.i32 q3,q3,q15 - vadd.i32 q7,q7,q15 - vadd.i32 q11,q11,q15 - - cmp r11,#64*4 - blo Ltail_neon - - vld1.8 {q12,q13},[r12]! @ load input - mov r11,sp - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 @ xor with input - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - vst1.8 {q0,q1},[r14]! @ store output - veor q5,q5,q13 - vld1.8 {q12,q13},[r12]! - veor q6,q6,q14 - vst1.8 {q2,q3},[r14]! - veor q7,q7,q15 - vld1.8 {q14,q15},[r12]! - - veor q8,q8,q12 - vld1.32 {q0,q1},[r11]! @ load for next iteration - veor d25,d25,d25 - vldr d24,[sp,#4*(16+4)] @ four - veor q9,q9,q13 - vld1.32 {q2,q3},[r11] - veor q10,q10,q14 - vst1.8 {q4,q5},[r14]! - veor q11,q11,q15 - vst1.8 {q6,q7},[r14]! 
- - vadd.i32 d6,d6,d24 @ next counter value - vldr d24,[sp,#4*(16+0)] @ one - - ldmia sp,{r8,r9,r10,r11} @ load key material - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - vst1.8 {q8,q9},[r14]! - add r1,r1,r9 - ldr r9,[r12,#-12] - vst1.8 {q10,q11},[r14]! - add r2,r2,r10 - ldr r10,[r12,#-8] - add r3,r3,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - eor r0,r0,r8 @ xor with input - add r8,sp,#4*(4) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r5,r5,r9 - ldr r9,[r12,#-12] - add r6,r6,r10 - ldr r10,[r12,#-8] - add r7,r7,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - eor r4,r4,r8 - add r8,sp,#4*(8) - eor r5,r5,r9 - str r4,[r14],#16 @ store output - eor r6,r6,r10 - str r5,[r14,#-12] - eor r7,r7,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r1,r1,r9 - ldr r9,[r12,#-12] -# ifdef __thumb2__ - it hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - add r2,r2,r10 - ldr r10,[r12,#-8] -# ifdef __thumb2__ - it hi -# endif - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r3,r3,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - eor r0,r0,r8 - add r8,sp,#4*(12) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r8,r8,#4 @ next counter value - add r5,r5,r9 - str r8,[sp,#4*(12)] @ save next counter value - ldr r8,[r12],#16 @ load input - add r6,r6,r10 - add r4,r4,#3 @ counter+3 - ldr r9,[r12,#-12] - add r7,r7,r11 - ldr r10,[r12,#-8] - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - eor r4,r4,r8 -# ifdef __thumb2__ - it hi -# endif - ldrhi r8,[sp,#4*(32+2)] @ re-load len - eor r5,r5,r9 - eor r6,r6,r10 - str r4,[r14],#16 @ store output - eor r7,r7,r11 - str r5,[r14,#-12] - sub r11,r8,#64*4 @ len-=64*4 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi Loop_neon_outer - - b Ldone_neon - -.align 4 -Lbreak_neon: - @ harmonize NEON and integer-only stack frames: load data - @ from NEON frame, but save to integer-only one; distance - @ between the two is 4*(32+4+16-32)=4*(20). - - str r11, [sp,#4*(20+32+2)] @ save len - add r11,sp,#4*(32+4) - str r12, [sp,#4*(20+32+1)] @ save inp - str r14, [sp,#4*(20+32+0)] @ save out - - ldr r12,[sp,#4*(16+10)] - ldr r14,[sp,#4*(16+11)] - vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement - str r12,[sp,#4*(20+16+10)] @ copy "rx" - str r14,[sp,#4*(20+16+11)] @ copy "rx" - - ldr r11, [sp,#4*(15)] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - ldr r10, [sp,#4*(13)] - ldr r14,[sp,#4*(14)] - str r11, [sp,#4*(20+16+15)] - add r11,sp,#4*(20) - vst1.32 {q0,q1},[r11]! 
@ copy key - add sp,sp,#4*(20) @ switch frame - vst1.32 {q2,q3},[r11] - mov r11,#10 - b Loop @ go integer-only - -.align 4 -Ltail_neon: - cmp r11,#64*3 - bhs L192_or_more_neon - cmp r11,#64*2 - bhs L128_or_more_neon - cmp r11,#64*1 - bhs L64_or_more_neon - - add r8,sp,#4*(8) - vst1.8 {q0,q1},[sp] - add r10,sp,#4*(0) - vst1.8 {q2,q3},[r8] - b Loop_tail_neon - -.align 4 -L64_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - veor q2,q2,q14 - veor q3,q3,q15 - vst1.8 {q0,q1},[r14]! - vst1.8 {q2,q3},[r14]! - - beq Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q4,q5},[sp] - add r10,sp,#4*(0) - vst1.8 {q6,q7},[r8] - sub r11,r11,#64*1 @ len-=64*1 - b Loop_tail_neon - -.align 4 -L128_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vst1.8 {q0,q1},[r14]! - veor q6,q6,q14 - vst1.8 {q2,q3},[r14]! - veor q7,q7,q15 - vst1.8 {q4,q5},[r14]! - vst1.8 {q6,q7},[r14]! - - beq Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q8,q9},[sp] - add r10,sp,#4*(0) - vst1.8 {q10,q11},[r8] - sub r11,r11,#64*2 @ len-=64*2 - b Loop_tail_neon - -.align 4 -L192_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vld1.8 {q12,q13},[r12]! - veor q6,q6,q14 - vst1.8 {q0,q1},[r14]! - veor q7,q7,q15 - vld1.8 {q14,q15},[r12]! - - veor q8,q8,q12 - vst1.8 {q2,q3},[r14]! - veor q9,q9,q13 - vst1.8 {q4,q5},[r14]! - veor q10,q10,q14 - vst1.8 {q6,q7},[r14]! - veor q11,q11,q15 - vst1.8 {q8,q9},[r14]! - vst1.8 {q10,q11},[r14]! 
- - beq Ldone_neon - - ldmia sp,{r8,r9,r10,r11} @ load key material - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(4) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - - add r4,r4,r8 @ accumulate key material - add r8,sp,#4*(8) - add r5,r5,r9 - add r6,r6,r10 - add r7,r7,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} - add r0,sp,#4*(16+8) - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(12) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - - add r4,r4,r8 @ accumulate key material - add r8,sp,#4*(8) - add r5,r5,r9 - add r4,r4,#3 @ counter+3 - add r6,r6,r10 - add r7,r7,r11 - ldr r11,[sp,#4*(32+2)] @ re-load len -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} - add r10,sp,#4*(0) - sub r11,r11,#64*3 @ len-=64*3 - -Loop_tail_neon: - ldrb r8,[r10],#1 @ read buffer on stack - ldrb r9,[r12],#1 @ read input - subs r11,r11,#1 - eor r8,r8,r9 - strb r8,[r14],#1 @ store output - bne Loop_tail_neon - -Ldone_neon: - add sp,sp,#4*(32+4) - vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} - add sp,sp,#4*(16+3) - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} - -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-windows.windows.x86.S deleted file mode 100644 index ef88da19..00000000 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-windows.windows.x86.S +++ /dev/null @@ -1,973 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _ChaCha20_ctr32_nohw -align 16 -_ChaCha20_ctr32_nohw: -L$_ChaCha20_ctr32_nohw_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [32+esp] - mov edi,DWORD [36+esp] - sub esp,132 - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edx,DWORD [12+esi] - mov DWORD [80+esp],eax - mov DWORD [84+esp],ebx - mov DWORD [88+esp],ecx - mov DWORD [92+esp],edx - mov eax,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov edx,DWORD [28+esi] - mov DWORD [96+esp],eax - mov DWORD [100+esp],ebx - mov DWORD [104+esp],ecx - mov DWORD [108+esp],edx - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - sub eax,1 - mov DWORD [112+esp],eax - mov DWORD [116+esp],ebx - mov DWORD [120+esp],ecx - mov DWORD [124+esp],edx - jmp NEAR L$000entry -align 16 -L$001outer_loop: - mov DWORD [156+esp],ebx - mov DWORD [152+esp],eax - mov DWORD [160+esp],ecx -L$000entry: - mov eax,1634760805 - mov DWORD [4+esp],857760878 - mov DWORD [8+esp],2036477234 - mov DWORD [12+esp],1797285236 - mov ebx,DWORD [84+esp] - mov ebp,DWORD [88+esp] - mov ecx,DWORD [104+esp] - mov esi,DWORD [108+esp] - mov edx,DWORD [116+esp] - mov edi,DWORD [120+esp] - mov DWORD [20+esp],ebx - mov DWORD [24+esp],ebp - mov DWORD [40+esp],ecx - mov DWORD [44+esp],esi - mov DWORD [52+esp],edx - mov DWORD [56+esp],edi - mov ebx,DWORD [92+esp] - mov edi,DWORD [124+esp] - mov edx,DWORD [112+esp] - mov ebp,DWORD [80+esp] - mov ecx,DWORD [96+esp] - mov esi,DWORD [100+esp] - add edx,1 - mov DWORD [28+esp],ebx - mov DWORD [60+esp],edi - mov DWORD [112+esp],edx - mov ebx,10 - jmp NEAR L$002loop -align 16 -L$002loop: - add eax,ebp - mov DWORD [128+esp],ebx - mov ebx,ebp - xor edx,eax - rol edx,16 - add ecx,edx - xor ebx,ecx - mov edi,DWORD [52+esp] - rol ebx,12 - mov ebp,DWORD [20+esp] - add eax,ebx - xor edx,eax - mov DWORD [esp],eax - rol edx,8 - mov eax,DWORD [4+esp] - add ecx,edx - mov DWORD [48+esp],edx - xor ebx,ecx - add eax,ebp - rol ebx,7 - xor edi,eax - mov DWORD [32+esp],ecx - rol edi,16 - mov DWORD [16+esp],ebx - add esi,edi - mov ecx,DWORD [40+esp] - xor ebp,esi - mov edx,DWORD [56+esp] - rol ebp,12 - mov ebx,DWORD [24+esp] - add eax,ebp - xor edi,eax - mov DWORD [4+esp],eax - rol edi,8 - mov eax,DWORD [8+esp] - add esi,edi - mov DWORD [52+esp],edi - xor ebp,esi - add eax,ebx - rol ebp,7 - xor edx,eax - mov DWORD [36+esp],esi - rol edx,16 - mov DWORD [20+esp],ebp - add ecx,edx - mov esi,DWORD [44+esp] - xor ebx,ecx - mov edi,DWORD [60+esp] - rol ebx,12 - mov ebp,DWORD [28+esp] - add eax,ebx - xor edx,eax - mov DWORD [8+esp],eax - rol edx,8 - mov eax,DWORD [12+esp] - add ecx,edx - mov DWORD [56+esp],edx - xor ebx,ecx - add eax,ebp - rol ebx,7 - xor edi,eax - rol edi,16 - mov DWORD [24+esp],ebx - add esi,edi - xor ebp,esi - rol ebp,12 - mov ebx,DWORD [20+esp] - add eax,ebp - xor edi,eax - mov DWORD [12+esp],eax - rol edi,8 - mov eax,DWORD [esp] - add esi,edi - mov edx,edi - xor ebp,esi - add eax,ebx - rol ebp,7 - xor edx,eax - rol edx,16 - mov DWORD [28+esp],ebp - add ecx,edx - xor ebx,ecx - mov edi,DWORD [48+esp] - rol ebx,12 - mov ebp,DWORD [24+esp] - add eax,ebx - xor edx,eax - mov DWORD [esp],eax - rol edx,8 - mov eax,DWORD [4+esp] - add ecx,edx - mov DWORD 
[60+esp],edx - xor ebx,ecx - add eax,ebp - rol ebx,7 - xor edi,eax - mov DWORD [40+esp],ecx - rol edi,16 - mov DWORD [20+esp],ebx - add esi,edi - mov ecx,DWORD [32+esp] - xor ebp,esi - mov edx,DWORD [52+esp] - rol ebp,12 - mov ebx,DWORD [28+esp] - add eax,ebp - xor edi,eax - mov DWORD [4+esp],eax - rol edi,8 - mov eax,DWORD [8+esp] - add esi,edi - mov DWORD [48+esp],edi - xor ebp,esi - add eax,ebx - rol ebp,7 - xor edx,eax - mov DWORD [44+esp],esi - rol edx,16 - mov DWORD [24+esp],ebp - add ecx,edx - mov esi,DWORD [36+esp] - xor ebx,ecx - mov edi,DWORD [56+esp] - rol ebx,12 - mov ebp,DWORD [16+esp] - add eax,ebx - xor edx,eax - mov DWORD [8+esp],eax - rol edx,8 - mov eax,DWORD [12+esp] - add ecx,edx - mov DWORD [52+esp],edx - xor ebx,ecx - add eax,ebp - rol ebx,7 - xor edi,eax - rol edi,16 - mov DWORD [28+esp],ebx - add esi,edi - xor ebp,esi - mov edx,DWORD [48+esp] - rol ebp,12 - mov ebx,DWORD [128+esp] - add eax,ebp - xor edi,eax - mov DWORD [12+esp],eax - rol edi,8 - mov eax,DWORD [esp] - add esi,edi - mov DWORD [56+esp],edi - xor ebp,esi - rol ebp,7 - dec ebx - jnz NEAR L$002loop - mov ebx,DWORD [160+esp] - add eax,1634760805 - add ebp,DWORD [80+esp] - add ecx,DWORD [96+esp] - add esi,DWORD [100+esp] - cmp ebx,64 - jb NEAR L$003tail - mov ebx,DWORD [156+esp] - add edx,DWORD [112+esp] - add edi,DWORD [120+esp] - xor eax,DWORD [ebx] - xor ebp,DWORD [16+ebx] - mov DWORD [esp],eax - mov eax,DWORD [152+esp] - xor ecx,DWORD [32+ebx] - xor esi,DWORD [36+ebx] - xor edx,DWORD [48+ebx] - xor edi,DWORD [56+ebx] - mov DWORD [16+eax],ebp - mov DWORD [32+eax],ecx - mov DWORD [36+eax],esi - mov DWORD [48+eax],edx - mov DWORD [56+eax],edi - mov ebp,DWORD [4+esp] - mov ecx,DWORD [8+esp] - mov esi,DWORD [12+esp] - mov edx,DWORD [20+esp] - mov edi,DWORD [24+esp] - add ebp,857760878 - add ecx,2036477234 - add esi,1797285236 - add edx,DWORD [84+esp] - add edi,DWORD [88+esp] - xor ebp,DWORD [4+ebx] - xor ecx,DWORD [8+ebx] - xor esi,DWORD [12+ebx] - xor edx,DWORD [20+ebx] - xor edi,DWORD [24+ebx] - mov DWORD [4+eax],ebp - mov DWORD [8+eax],ecx - mov DWORD [12+eax],esi - mov DWORD [20+eax],edx - mov DWORD [24+eax],edi - mov ebp,DWORD [28+esp] - mov ecx,DWORD [40+esp] - mov esi,DWORD [44+esp] - mov edx,DWORD [52+esp] - mov edi,DWORD [60+esp] - add ebp,DWORD [92+esp] - add ecx,DWORD [104+esp] - add esi,DWORD [108+esp] - add edx,DWORD [116+esp] - add edi,DWORD [124+esp] - xor ebp,DWORD [28+ebx] - xor ecx,DWORD [40+ebx] - xor esi,DWORD [44+ebx] - xor edx,DWORD [52+ebx] - xor edi,DWORD [60+ebx] - lea ebx,[64+ebx] - mov DWORD [28+eax],ebp - mov ebp,DWORD [esp] - mov DWORD [40+eax],ecx - mov ecx,DWORD [160+esp] - mov DWORD [44+eax],esi - mov DWORD [52+eax],edx - mov DWORD [60+eax],edi - mov DWORD [eax],ebp - lea eax,[64+eax] - sub ecx,64 - jnz NEAR L$001outer_loop - jmp NEAR L$004done -L$003tail: - add edx,DWORD [112+esp] - add edi,DWORD [120+esp] - mov DWORD [esp],eax - mov DWORD [16+esp],ebp - mov DWORD [32+esp],ecx - mov DWORD [36+esp],esi - mov DWORD [48+esp],edx - mov DWORD [56+esp],edi - mov ebp,DWORD [4+esp] - mov ecx,DWORD [8+esp] - mov esi,DWORD [12+esp] - mov edx,DWORD [20+esp] - mov edi,DWORD [24+esp] - add ebp,857760878 - add ecx,2036477234 - add esi,1797285236 - add edx,DWORD [84+esp] - add edi,DWORD [88+esp] - mov DWORD [4+esp],ebp - mov DWORD [8+esp],ecx - mov DWORD [12+esp],esi - mov DWORD [20+esp],edx - mov DWORD [24+esp],edi - mov ebp,DWORD [28+esp] - mov ecx,DWORD [40+esp] - mov esi,DWORD [44+esp] - mov edx,DWORD [52+esp] - mov edi,DWORD [60+esp] - add ebp,DWORD [92+esp] - add ecx,DWORD [104+esp] 
- add esi,DWORD [108+esp] - add edx,DWORD [116+esp] - add edi,DWORD [124+esp] - mov DWORD [28+esp],ebp - mov ebp,DWORD [156+esp] - mov DWORD [40+esp],ecx - mov ecx,DWORD [152+esp] - mov DWORD [44+esp],esi - xor esi,esi - mov DWORD [52+esp],edx - mov DWORD [60+esp],edi - xor eax,eax - xor edx,edx -L$005tail_loop: - mov al,BYTE [ebp*1+esi] - mov dl,BYTE [esi*1+esp] - lea esi,[1+esi] - xor al,dl - mov BYTE [esi*1+ecx-1],al - dec ebx - jnz NEAR L$005tail_loop -L$004done: - add esp,132 - pop edi - pop esi - pop ebx - pop ebp - ret -global _ChaCha20_ctr32_ssse3 -align 16 -_ChaCha20_ctr32_ssse3: -L$_ChaCha20_ctr32_ssse3_begin: - push ebp - push ebx - push esi - push edi - call L$pic_point -L$pic_point: - pop eax - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov ecx,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - mov ebp,esp - sub esp,524 - and esp,-64 - mov DWORD [512+esp],ebp - lea eax,[(L$ssse3_data-L$pic_point)+eax] - movdqu xmm3,[ebx] - cmp ecx,256 - jb NEAR L$0061x - mov DWORD [516+esp],edx - mov DWORD [520+esp],ebx - sub ecx,256 - lea ebp,[384+esp] - movdqu xmm7,[edx] - pshufd xmm0,xmm3,0 - pshufd xmm1,xmm3,85 - pshufd xmm2,xmm3,170 - pshufd xmm3,xmm3,255 - paddd xmm0,[48+eax] - pshufd xmm4,xmm7,0 - pshufd xmm5,xmm7,85 - psubd xmm0,[64+eax] - pshufd xmm6,xmm7,170 - pshufd xmm7,xmm7,255 - movdqa [64+ebp],xmm0 - movdqa [80+ebp],xmm1 - movdqa [96+ebp],xmm2 - movdqa [112+ebp],xmm3 - movdqu xmm3,[16+edx] - movdqa [ebp-64],xmm4 - movdqa [ebp-48],xmm5 - movdqa [ebp-32],xmm6 - movdqa [ebp-16],xmm7 - movdqa xmm7,[32+eax] - lea ebx,[128+esp] - pshufd xmm0,xmm3,0 - pshufd xmm1,xmm3,85 - pshufd xmm2,xmm3,170 - pshufd xmm3,xmm3,255 - pshufd xmm4,xmm7,0 - pshufd xmm5,xmm7,85 - pshufd xmm6,xmm7,170 - pshufd xmm7,xmm7,255 - movdqa [ebp],xmm0 - movdqa [16+ebp],xmm1 - movdqa [32+ebp],xmm2 - movdqa [48+ebp],xmm3 - movdqa [ebp-128],xmm4 - movdqa [ebp-112],xmm5 - movdqa [ebp-96],xmm6 - movdqa [ebp-80],xmm7 - lea esi,[128+esi] - lea edi,[128+edi] - jmp NEAR L$007outer_loop -align 16 -L$007outer_loop: - movdqa xmm1,[ebp-112] - movdqa xmm2,[ebp-96] - movdqa xmm3,[ebp-80] - movdqa xmm5,[ebp-48] - movdqa xmm6,[ebp-32] - movdqa xmm7,[ebp-16] - movdqa [ebx-112],xmm1 - movdqa [ebx-96],xmm2 - movdqa [ebx-80],xmm3 - movdqa [ebx-48],xmm5 - movdqa [ebx-32],xmm6 - movdqa [ebx-16],xmm7 - movdqa xmm2,[32+ebp] - movdqa xmm3,[48+ebp] - movdqa xmm4,[64+ebp] - movdqa xmm5,[80+ebp] - movdqa xmm6,[96+ebp] - movdqa xmm7,[112+ebp] - paddd xmm4,[64+eax] - movdqa [32+ebx],xmm2 - movdqa [48+ebx],xmm3 - movdqa [64+ebx],xmm4 - movdqa [80+ebx],xmm5 - movdqa [96+ebx],xmm6 - movdqa [112+ebx],xmm7 - movdqa [64+ebp],xmm4 - movdqa xmm0,[ebp-128] - movdqa xmm6,xmm4 - movdqa xmm3,[ebp-64] - movdqa xmm4,[ebp] - movdqa xmm5,[16+ebp] - mov edx,10 - nop -align 16 -L$008loop: - paddd xmm0,xmm3 - movdqa xmm2,xmm3 - pxor xmm6,xmm0 - pshufb xmm6,[eax] - paddd xmm4,xmm6 - pxor xmm2,xmm4 - movdqa xmm3,[ebx-48] - movdqa xmm1,xmm2 - pslld xmm2,12 - psrld xmm1,20 - por xmm2,xmm1 - movdqa xmm1,[ebx-112] - paddd xmm0,xmm2 - movdqa xmm7,[80+ebx] - pxor xmm6,xmm0 - movdqa [ebx-128],xmm0 - pshufb xmm6,[16+eax] - paddd xmm4,xmm6 - movdqa [64+ebx],xmm6 - pxor xmm2,xmm4 - paddd xmm1,xmm3 - movdqa xmm0,xmm2 - pslld xmm2,7 - psrld xmm0,25 - pxor xmm7,xmm1 - por xmm2,xmm0 - movdqa [ebx],xmm4 - pshufb xmm7,[eax] - movdqa [ebx-64],xmm2 - paddd xmm5,xmm7 - movdqa xmm4,[32+ebx] - pxor xmm3,xmm5 - movdqa xmm2,[ebx-32] - movdqa xmm0,xmm3 - pslld xmm3,12 - psrld xmm0,20 - por xmm3,xmm0 - movdqa xmm0,[ebx-96] - paddd xmm1,xmm3 - movdqa xmm6,[96+ebx] - 
pxor xmm7,xmm1 - movdqa [ebx-112],xmm1 - pshufb xmm7,[16+eax] - paddd xmm5,xmm7 - movdqa [80+ebx],xmm7 - pxor xmm3,xmm5 - paddd xmm0,xmm2 - movdqa xmm1,xmm3 - pslld xmm3,7 - psrld xmm1,25 - pxor xmm6,xmm0 - por xmm3,xmm1 - movdqa [16+ebx],xmm5 - pshufb xmm6,[eax] - movdqa [ebx-48],xmm3 - paddd xmm4,xmm6 - movdqa xmm5,[48+ebx] - pxor xmm2,xmm4 - movdqa xmm3,[ebx-16] - movdqa xmm1,xmm2 - pslld xmm2,12 - psrld xmm1,20 - por xmm2,xmm1 - movdqa xmm1,[ebx-80] - paddd xmm0,xmm2 - movdqa xmm7,[112+ebx] - pxor xmm6,xmm0 - movdqa [ebx-96],xmm0 - pshufb xmm6,[16+eax] - paddd xmm4,xmm6 - movdqa [96+ebx],xmm6 - pxor xmm2,xmm4 - paddd xmm1,xmm3 - movdqa xmm0,xmm2 - pslld xmm2,7 - psrld xmm0,25 - pxor xmm7,xmm1 - por xmm2,xmm0 - pshufb xmm7,[eax] - movdqa [ebx-32],xmm2 - paddd xmm5,xmm7 - pxor xmm3,xmm5 - movdqa xmm2,[ebx-48] - movdqa xmm0,xmm3 - pslld xmm3,12 - psrld xmm0,20 - por xmm3,xmm0 - movdqa xmm0,[ebx-128] - paddd xmm1,xmm3 - pxor xmm7,xmm1 - movdqa [ebx-80],xmm1 - pshufb xmm7,[16+eax] - paddd xmm5,xmm7 - movdqa xmm6,xmm7 - pxor xmm3,xmm5 - paddd xmm0,xmm2 - movdqa xmm1,xmm3 - pslld xmm3,7 - psrld xmm1,25 - pxor xmm6,xmm0 - por xmm3,xmm1 - pshufb xmm6,[eax] - movdqa [ebx-16],xmm3 - paddd xmm4,xmm6 - pxor xmm2,xmm4 - movdqa xmm3,[ebx-32] - movdqa xmm1,xmm2 - pslld xmm2,12 - psrld xmm1,20 - por xmm2,xmm1 - movdqa xmm1,[ebx-112] - paddd xmm0,xmm2 - movdqa xmm7,[64+ebx] - pxor xmm6,xmm0 - movdqa [ebx-128],xmm0 - pshufb xmm6,[16+eax] - paddd xmm4,xmm6 - movdqa [112+ebx],xmm6 - pxor xmm2,xmm4 - paddd xmm1,xmm3 - movdqa xmm0,xmm2 - pslld xmm2,7 - psrld xmm0,25 - pxor xmm7,xmm1 - por xmm2,xmm0 - movdqa [32+ebx],xmm4 - pshufb xmm7,[eax] - movdqa [ebx-48],xmm2 - paddd xmm5,xmm7 - movdqa xmm4,[ebx] - pxor xmm3,xmm5 - movdqa xmm2,[ebx-16] - movdqa xmm0,xmm3 - pslld xmm3,12 - psrld xmm0,20 - por xmm3,xmm0 - movdqa xmm0,[ebx-96] - paddd xmm1,xmm3 - movdqa xmm6,[80+ebx] - pxor xmm7,xmm1 - movdqa [ebx-112],xmm1 - pshufb xmm7,[16+eax] - paddd xmm5,xmm7 - movdqa [64+ebx],xmm7 - pxor xmm3,xmm5 - paddd xmm0,xmm2 - movdqa xmm1,xmm3 - pslld xmm3,7 - psrld xmm1,25 - pxor xmm6,xmm0 - por xmm3,xmm1 - movdqa [48+ebx],xmm5 - pshufb xmm6,[eax] - movdqa [ebx-32],xmm3 - paddd xmm4,xmm6 - movdqa xmm5,[16+ebx] - pxor xmm2,xmm4 - movdqa xmm3,[ebx-64] - movdqa xmm1,xmm2 - pslld xmm2,12 - psrld xmm1,20 - por xmm2,xmm1 - movdqa xmm1,[ebx-80] - paddd xmm0,xmm2 - movdqa xmm7,[96+ebx] - pxor xmm6,xmm0 - movdqa [ebx-96],xmm0 - pshufb xmm6,[16+eax] - paddd xmm4,xmm6 - movdqa [80+ebx],xmm6 - pxor xmm2,xmm4 - paddd xmm1,xmm3 - movdqa xmm0,xmm2 - pslld xmm2,7 - psrld xmm0,25 - pxor xmm7,xmm1 - por xmm2,xmm0 - pshufb xmm7,[eax] - movdqa [ebx-16],xmm2 - paddd xmm5,xmm7 - pxor xmm3,xmm5 - movdqa xmm0,xmm3 - pslld xmm3,12 - psrld xmm0,20 - por xmm3,xmm0 - movdqa xmm0,[ebx-128] - paddd xmm1,xmm3 - movdqa xmm6,[64+ebx] - pxor xmm7,xmm1 - movdqa [ebx-80],xmm1 - pshufb xmm7,[16+eax] - paddd xmm5,xmm7 - movdqa [96+ebx],xmm7 - pxor xmm3,xmm5 - movdqa xmm1,xmm3 - pslld xmm3,7 - psrld xmm1,25 - por xmm3,xmm1 - dec edx - jnz NEAR L$008loop - movdqa [ebx-64],xmm3 - movdqa [ebx],xmm4 - movdqa [16+ebx],xmm5 - movdqa [64+ebx],xmm6 - movdqa [96+ebx],xmm7 - movdqa xmm1,[ebx-112] - movdqa xmm2,[ebx-96] - movdqa xmm3,[ebx-80] - paddd xmm0,[ebp-128] - paddd xmm1,[ebp-112] - paddd xmm2,[ebp-96] - paddd xmm3,[ebp-80] - movdqa xmm6,xmm0 - punpckldq xmm0,xmm1 - movdqa xmm7,xmm2 - punpckldq xmm2,xmm3 - punpckhdq xmm6,xmm1 - punpckhdq xmm7,xmm3 - movdqa xmm1,xmm0 - punpcklqdq xmm0,xmm2 - movdqa xmm3,xmm6 - punpcklqdq xmm6,xmm7 - punpckhqdq xmm1,xmm2 - punpckhqdq 
xmm3,xmm7 - movdqu xmm4,[esi-128] - movdqu xmm5,[esi-64] - movdqu xmm2,[esi] - movdqu xmm7,[64+esi] - lea esi,[16+esi] - pxor xmm4,xmm0 - movdqa xmm0,[ebx-64] - pxor xmm5,xmm1 - movdqa xmm1,[ebx-48] - pxor xmm6,xmm2 - movdqa xmm2,[ebx-32] - pxor xmm7,xmm3 - movdqa xmm3,[ebx-16] - movdqu [edi-128],xmm4 - movdqu [edi-64],xmm5 - movdqu [edi],xmm6 - movdqu [64+edi],xmm7 - lea edi,[16+edi] - paddd xmm0,[ebp-64] - paddd xmm1,[ebp-48] - paddd xmm2,[ebp-32] - paddd xmm3,[ebp-16] - movdqa xmm6,xmm0 - punpckldq xmm0,xmm1 - movdqa xmm7,xmm2 - punpckldq xmm2,xmm3 - punpckhdq xmm6,xmm1 - punpckhdq xmm7,xmm3 - movdqa xmm1,xmm0 - punpcklqdq xmm0,xmm2 - movdqa xmm3,xmm6 - punpcklqdq xmm6,xmm7 - punpckhqdq xmm1,xmm2 - punpckhqdq xmm3,xmm7 - movdqu xmm4,[esi-128] - movdqu xmm5,[esi-64] - movdqu xmm2,[esi] - movdqu xmm7,[64+esi] - lea esi,[16+esi] - pxor xmm4,xmm0 - movdqa xmm0,[ebx] - pxor xmm5,xmm1 - movdqa xmm1,[16+ebx] - pxor xmm6,xmm2 - movdqa xmm2,[32+ebx] - pxor xmm7,xmm3 - movdqa xmm3,[48+ebx] - movdqu [edi-128],xmm4 - movdqu [edi-64],xmm5 - movdqu [edi],xmm6 - movdqu [64+edi],xmm7 - lea edi,[16+edi] - paddd xmm0,[ebp] - paddd xmm1,[16+ebp] - paddd xmm2,[32+ebp] - paddd xmm3,[48+ebp] - movdqa xmm6,xmm0 - punpckldq xmm0,xmm1 - movdqa xmm7,xmm2 - punpckldq xmm2,xmm3 - punpckhdq xmm6,xmm1 - punpckhdq xmm7,xmm3 - movdqa xmm1,xmm0 - punpcklqdq xmm0,xmm2 - movdqa xmm3,xmm6 - punpcklqdq xmm6,xmm7 - punpckhqdq xmm1,xmm2 - punpckhqdq xmm3,xmm7 - movdqu xmm4,[esi-128] - movdqu xmm5,[esi-64] - movdqu xmm2,[esi] - movdqu xmm7,[64+esi] - lea esi,[16+esi] - pxor xmm4,xmm0 - movdqa xmm0,[64+ebx] - pxor xmm5,xmm1 - movdqa xmm1,[80+ebx] - pxor xmm6,xmm2 - movdqa xmm2,[96+ebx] - pxor xmm7,xmm3 - movdqa xmm3,[112+ebx] - movdqu [edi-128],xmm4 - movdqu [edi-64],xmm5 - movdqu [edi],xmm6 - movdqu [64+edi],xmm7 - lea edi,[16+edi] - paddd xmm0,[64+ebp] - paddd xmm1,[80+ebp] - paddd xmm2,[96+ebp] - paddd xmm3,[112+ebp] - movdqa xmm6,xmm0 - punpckldq xmm0,xmm1 - movdqa xmm7,xmm2 - punpckldq xmm2,xmm3 - punpckhdq xmm6,xmm1 - punpckhdq xmm7,xmm3 - movdqa xmm1,xmm0 - punpcklqdq xmm0,xmm2 - movdqa xmm3,xmm6 - punpcklqdq xmm6,xmm7 - punpckhqdq xmm1,xmm2 - punpckhqdq xmm3,xmm7 - movdqu xmm4,[esi-128] - movdqu xmm5,[esi-64] - movdqu xmm2,[esi] - movdqu xmm7,[64+esi] - lea esi,[208+esi] - pxor xmm4,xmm0 - pxor xmm5,xmm1 - pxor xmm6,xmm2 - pxor xmm7,xmm3 - movdqu [edi-128],xmm4 - movdqu [edi-64],xmm5 - movdqu [edi],xmm6 - movdqu [64+edi],xmm7 - lea edi,[208+edi] - sub ecx,256 - jnc NEAR L$007outer_loop - add ecx,256 - jz NEAR L$009done - mov ebx,DWORD [520+esp] - lea esi,[esi-128] - mov edx,DWORD [516+esp] - lea edi,[edi-128] - movd xmm2,DWORD [64+ebp] - movdqu xmm3,[ebx] - paddd xmm2,[96+eax] - pand xmm3,[112+eax] - por xmm3,xmm2 -L$0061x: - movdqa xmm0,[32+eax] - movdqu xmm1,[edx] - movdqu xmm2,[16+edx] - movdqa xmm6,[eax] - movdqa xmm7,[16+eax] - mov DWORD [48+esp],ebp - movdqa [esp],xmm0 - movdqa [16+esp],xmm1 - movdqa [32+esp],xmm2 - movdqa [48+esp],xmm3 - mov edx,10 - jmp NEAR L$010loop1x -align 16 -L$011outer1x: - movdqa xmm3,[80+eax] - movdqa xmm0,[esp] - movdqa xmm1,[16+esp] - movdqa xmm2,[32+esp] - paddd xmm3,[48+esp] - mov edx,10 - movdqa [48+esp],xmm3 - jmp NEAR L$010loop1x -align 16 -L$010loop1x: - paddd xmm0,xmm1 - pxor xmm3,xmm0 -db 102,15,56,0,222 - paddd xmm2,xmm3 - pxor xmm1,xmm2 - movdqa xmm4,xmm1 - psrld xmm1,20 - pslld xmm4,12 - por xmm1,xmm4 - paddd xmm0,xmm1 - pxor xmm3,xmm0 -db 102,15,56,0,223 - paddd xmm2,xmm3 - pxor xmm1,xmm2 - movdqa xmm4,xmm1 - psrld xmm1,25 - pslld xmm4,7 - por xmm1,xmm4 - pshufd 
xmm2,xmm2,78 - pshufd xmm1,xmm1,57 - pshufd xmm3,xmm3,147 - nop - paddd xmm0,xmm1 - pxor xmm3,xmm0 -db 102,15,56,0,222 - paddd xmm2,xmm3 - pxor xmm1,xmm2 - movdqa xmm4,xmm1 - psrld xmm1,20 - pslld xmm4,12 - por xmm1,xmm4 - paddd xmm0,xmm1 - pxor xmm3,xmm0 -db 102,15,56,0,223 - paddd xmm2,xmm3 - pxor xmm1,xmm2 - movdqa xmm4,xmm1 - psrld xmm1,25 - pslld xmm4,7 - por xmm1,xmm4 - pshufd xmm2,xmm2,78 - pshufd xmm1,xmm1,147 - pshufd xmm3,xmm3,57 - dec edx - jnz NEAR L$010loop1x - paddd xmm0,[esp] - paddd xmm1,[16+esp] - paddd xmm2,[32+esp] - paddd xmm3,[48+esp] - cmp ecx,64 - jb NEAR L$012tail - movdqu xmm4,[esi] - movdqu xmm5,[16+esi] - pxor xmm0,xmm4 - movdqu xmm4,[32+esi] - pxor xmm1,xmm5 - movdqu xmm5,[48+esi] - pxor xmm2,xmm4 - pxor xmm3,xmm5 - lea esi,[64+esi] - movdqu [edi],xmm0 - movdqu [16+edi],xmm1 - movdqu [32+edi],xmm2 - movdqu [48+edi],xmm3 - lea edi,[64+edi] - sub ecx,64 - jnz NEAR L$011outer1x - jmp NEAR L$009done -L$012tail: - movdqa [esp],xmm0 - movdqa [16+esp],xmm1 - movdqa [32+esp],xmm2 - movdqa [48+esp],xmm3 - xor eax,eax - xor edx,edx - xor ebp,ebp -L$013tail_loop: - mov al,BYTE [ebp*1+esp] - mov dl,BYTE [ebp*1+esi] - lea ebp,[1+ebp] - xor al,dl - mov BYTE [ebp*1+edi-1],al - dec ecx - jnz NEAR L$013tail_loop -L$009done: - mov esp,DWORD [512+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$ssse3_data: -db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 -db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 -dd 1634760805,857760878,2036477234,1797285236 -dd 0,1,2,3 -dd 4,4,4,4 -dd 1,0,0,0 -dd 4,0,0,0 -dd 0,-1,-1,-1 -align 64 -db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 -db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 -db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 -db 114,103,62,0 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_des.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_des.c index 3fabb19a..d5aeaf96 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_des.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_des.c @@ -85,17 +85,14 @@ static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, } static const EVP_CIPHER evp_des_cbc = { - /* nid = */ NID_des_cbc, - /* block_size = */ 8, - /* key_len = */ 8, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(EVP_DES_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_init_key, - /* cipher = */ des_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_cbc, + .block_size = 8, + .key_len = 8, + .iv_len = 8, + .ctx_size = sizeof(EVP_DES_KEY), + .flags = EVP_CIPH_CBC_MODE, + .init = des_init_key, + .cipher = des_cbc_cipher, }; const EVP_CIPHER *EVP_des_cbc(void) { return &evp_des_cbc; } @@ -115,17 +112,14 @@ static int des_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, } static const EVP_CIPHER evp_des_ecb = { - /* nid = */ NID_des_ecb, - /* block_size = */ 8, - /* key_len = */ 8, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(EVP_DES_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_init_key, - /* cipher = */ des_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ecb, + .block_size = 8, + .key_len = 8, + .iv_len = 0, + .ctx_size = sizeof(EVP_DES_KEY), + .flags = EVP_CIPH_ECB_MODE, + .init = des_init_key, + .cipher = 
des_ecb_cipher, }; const EVP_CIPHER *EVP_des_ecb(void) { return &evp_des_ecb; } @@ -155,17 +149,14 @@ static int des_ede3_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, } static const EVP_CIPHER evp_des_ede3_cbc = { - /* nid = */ NID_des_ede3_cbc, - /* block_size = */ 8, - /* key_len = */ 24, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_ede3_init_key, - /* cipher = */ des_ede3_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ede3_cbc, + .block_size = 8, + .key_len = 24, + .iv_len = 8, + .ctx_size = sizeof(DES_EDE_KEY), + .flags = EVP_CIPH_CBC_MODE, + .init = des_ede3_init_key, + .cipher = des_ede3_cbc_cipher, }; const EVP_CIPHER *EVP_des_ede3_cbc(void) { return &evp_des_ede3_cbc; } @@ -181,17 +172,14 @@ static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, } static const EVP_CIPHER evp_des_ede_cbc = { - /* nid = */ NID_des_ede_cbc, - /* block_size = */ 8, - /* key_len = */ 16, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_ede_init_key, - /* cipher = */ des_ede3_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ede_cbc, + .block_size = 8, + .key_len = 16, + .iv_len = 8, + .ctx_size = sizeof(DES_EDE_KEY), + .flags = EVP_CIPH_CBC_MODE, + .init = des_ede_init_key, + .cipher = des_ede3_cbc_cipher, }; const EVP_CIPHER *EVP_des_ede_cbc(void) { return &evp_des_ede_cbc; } @@ -212,33 +200,27 @@ static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, } static const EVP_CIPHER evp_des_ede = { - /* nid = */ NID_des_ede_ecb, - /* block_size = */ 8, - /* key_len = */ 16, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_ede_init_key, - /* cipher = */ des_ede_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ede_ecb, + .block_size = 8, + .key_len = 16, + .iv_len = 0, + .ctx_size = sizeof(DES_EDE_KEY), + .flags = EVP_CIPH_ECB_MODE, + .init = des_ede_init_key, + .cipher = des_ede_ecb_cipher, }; const EVP_CIPHER *EVP_des_ede(void) { return &evp_des_ede; } static const EVP_CIPHER evp_des_ede3 = { - /* nid = */ NID_des_ede3_ecb, - /* block_size = */ 8, - /* key_len = */ 24, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_ede3_init_key, - /* cipher = */ des_ede_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, + .nid = NID_des_ede3_ecb, + .block_size = 8, + .key_len = 24, + .iv_len = 0, + .ctx_size = sizeof(DES_EDE_KEY), + .flags = EVP_CIPH_ECB_MODE, + .init = des_ede3_init_key, + .cipher = des_ede_ecb_cipher, }; const EVP_CIPHER *EVP_des_ede3(void) { return &evp_des_ede3; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_null.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_null.c index 8245c0ea..15e2eb7b 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_null.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_null.c @@ -78,9 +78,13 @@ static int null_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, } static const EVP_CIPHER n_cipher = { - NID_undef, 1 /* block size */, 0 /* key_len */, 0 /* iv_len */, - 0 /* ctx_size */, 0 /* flags */, NULL /* app_data */, null_init_key, - null_cipher, NULL /* cleanup */, NULL /* ctrl */, + .nid = NID_undef, + .block_size = 1, + .key_len = 0, + .iv_len = 0, + .ctx_size = 0, + .init = 
null_init_key, + .cipher = null_cipher, }; const EVP_CIPHER *EVP_enc_null(void) { return &n_cipher; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc2.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc2.c index eaf15062..695b6de0 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc2.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc2.c @@ -427,37 +427,29 @@ static int rc2_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) { } static const EVP_CIPHER rc2_40_cbc = { - NID_rc2_40_cbc, - 8 /* block size */, - 5 /* 40 bit */, - 8 /* iv len */, - sizeof(EVP_RC2_KEY), - EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, - NULL /* app_data */, - rc2_init_key, - rc2_cbc_cipher, - NULL, - rc2_ctrl, + .nid = NID_rc2_40_cbc, + .block_size = 8, + .key_len = 5 /* 40 bit */, + .iv_len = 8, + .ctx_size = sizeof(EVP_RC2_KEY), + .flags = EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, + .init = rc2_init_key, + .cipher = rc2_cbc_cipher, + .ctrl = rc2_ctrl, }; -const EVP_CIPHER *EVP_rc2_40_cbc(void) { - return &rc2_40_cbc; -} +const EVP_CIPHER *EVP_rc2_40_cbc(void) { return &rc2_40_cbc; } static const EVP_CIPHER rc2_cbc = { - NID_rc2_cbc, - 8 /* block size */, - 16 /* 128 bit */, - 8 /* iv len */, - sizeof(EVP_RC2_KEY), - EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, - NULL /* app_data */, - rc2_init_key, - rc2_cbc_cipher, - NULL, - rc2_ctrl, + .nid = NID_rc2_cbc, + .block_size = 8, + .key_len = 16 /* 128 bit */, + .iv_len = 8, + .ctx_size = sizeof(EVP_RC2_KEY), + .flags = EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, + .init = rc2_init_key, + .cipher = rc2_cbc_cipher, + .ctrl = rc2_ctrl, }; -const EVP_CIPHER *EVP_rc2_cbc(void) { - return &rc2_cbc; -} +const EVP_CIPHER *EVP_rc2_cbc(void) { return &rc2_cbc; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc4.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc4.c index b96269cd..395ffdf4 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc4.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/e_rc4.c @@ -81,9 +81,14 @@ static int rc4_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, } static const EVP_CIPHER rc4 = { - NID_rc4, 1 /* block_size */, 16 /* key_size */, - 0 /* iv_len */, sizeof(RC4_KEY), EVP_CIPH_VARIABLE_LENGTH, - NULL /* app_data */, rc4_init_key, rc4_cipher, - NULL /* cleanup */, NULL /* ctrl */, }; + .nid = NID_rc4, + .block_size = 1, + .key_len = 16, + .iv_len = 0, + .ctx_size = sizeof(RC4_KEY), + .flags = EVP_CIPH_VARIABLE_LENGTH, + .init = rc4_init_key, + .cipher = rc4_cipher, +}; const EVP_CIPHER *EVP_rc4(void) { return &rc4; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/internal.h b/Sources/CCryptoBoringSSL/crypto/cipher_extra/internal.h index a6882d4d..ef944faf 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/internal.h @@ -192,22 +192,65 @@ OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { // Additional input parameters are passed in |aead_data->in|. On exit, it will // write calculated tag value to |aead_data->out.tag|, which the caller must // check. 
+#if defined(OPENSSL_X86_64) +extern void chacha20_poly1305_open_nohw( + uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +extern void chacha20_poly1305_open_avx2( + uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +OPENSSL_INLINE void chacha20_poly1305_open(uint8_t *out_plaintext, + const uint8_t *ciphertext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_open_data *data) { + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + chacha20_poly1305_open_avx2(out_plaintext, ciphertext, plaintext_len, ad, + ad_len, data); + } else { + chacha20_poly1305_open_nohw(out_plaintext, ciphertext, plaintext_len, ad, + ad_len, data); + } +} +#else extern void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +#endif // chacha20_poly1305_open is defined in chacha20_poly1305_*.pl. It encrypts // |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|. // Additional input parameters are passed in |aead_data->in|. The calculated tag // value is over the computed ciphertext concatenated with |extra_ciphertext| // and written to |aead_data->out.tag|. +#if defined(OPENSSL_X86_64) +extern void chacha20_poly1305_seal_nohw( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +extern void chacha20_poly1305_seal_avx2( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +OPENSSL_INLINE void chacha20_poly1305_seal( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data) { + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + chacha20_poly1305_seal_avx2(out_ciphertext, plaintext, plaintext_len, ad, + ad_len, data); + } else { + chacha20_poly1305_seal_nohw(out_ciphertext, plaintext, plaintext_len, ad, + ad_len, data); + } +} +#else extern void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +#endif + #else OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { return 0; } diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/tls_cbc.c b/Sources/CCryptoBoringSSL/crypto/cipher_extra/tls_cbc.c index 571b9040..87fd2e00 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/tls_cbc.c +++ b/Sources/CCryptoBoringSSL/crypto/cipher_extra/tls_cbc.c @@ -121,8 +121,8 @@ void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in, size_t mac_end = in_len; size_t mac_start = mac_end - md_size; - assert(orig_len >= in_len); - assert(in_len >= md_size); + declassify_assert(orig_len >= in_len); + declassify_assert(in_len >= md_size); assert(md_size <= EVP_MAX_MD_SIZE); assert(md_size > 0); diff --git a/Sources/CCryptoBoringSSL/crypto/conf/conf.c b/Sources/CCryptoBoringSSL/crypto/conf/conf.c index 4b1b6fbe..12e50f04 100644 --- a/Sources/CCryptoBoringSSL/crypto/conf/conf.c +++ b/Sources/CCryptoBoringSSL/crypto/conf/conf.c @@ -56,6 +56,7 @@ #include +#include #include #include @@ -65,7 +66,6 @@ #include 
#include -#include "conf_def.h" #include "internal.h" #include "../internal.h" @@ -183,6 +183,26 @@ static CONF_SECTION *NCONF_new_section(const CONF *conf, const char *section) { return NULL; } +static int is_comment(char c) { return c == '#'; } + +static int is_quote(char c) { return c == '"' || c == '\'' || c == '`'; } + +static int is_esc(char c) { return c == '\\'; } + +static int is_conf_ws(char c) { + // This differs from |OPENSSL_isspace| in that CONF does not accept '\v' and + // '\f' as whitespace. + return c == ' ' || c == '\t' || c == '\r' || c == '\n'; +} + +static int is_name_char(char c) { + // Alphanumeric characters, and a handful of symbols, may appear in value and + // section names without escaping. + return OPENSSL_isalnum(c) || c == '_' || c == '!' || c == '.' || c == '%' || + c == '&' || c == '*' || c == '+' || c == ',' || c == '/' || c == ';' || + c == '?' || c == '@' || c == '^' || c == '~' || c == '|' || c == '-'; +} + static int str_copy(CONF *conf, char *section, char **pto, char *from) { int q, to = 0, len = 0; char v; @@ -199,13 +219,13 @@ static int str_copy(CONF *conf, char *section, char **pto, char *from) { } for (;;) { - if (IS_QUOTE(conf, *from)) { + if (is_quote(*from)) { q = *from; from++; - while (!IS_EOF(conf, *from) && (*from != q)) { - if (IS_ESC(conf, *from)) { + while (*from != '\0' && *from != q) { + if (is_esc(*from)) { from++; - if (IS_EOF(conf, *from)) { + if (*from == '\0') { break; } } @@ -214,10 +234,10 @@ static int str_copy(CONF *conf, char *section, char **pto, char *from) { if (*from == q) { from++; } - } else if (IS_ESC(conf, *from)) { + } else if (is_esc(*from)) { from++; v = *(from++); - if (IS_EOF(conf, v)) { + if (v == '\0') { break; } else if (v == 'r') { v = '\r'; @@ -229,11 +249,13 @@ static int str_copy(CONF *conf, char *section, char **pto, char *from) { v = '\t'; } buf->data[to++] = v; - } else if (IS_EOF(conf, *from)) { + } else if (*from == '\0') { break; } else if (*from == '$') { // Historically, $foo would expand to a previously-parsed value. This - // feature has been removed as it was unused and is a DoS vector. + // feature has been removed as it was unused and is a DoS vector. If + // trying to embed '$' in a line, either escape it or wrap the value in + // quotes. OPENSSL_PUT_ERROR(CONF, CONF_R_VARIABLE_EXPANSION_NOT_SUPPORTED); goto err; } else { @@ -312,36 +334,39 @@ static int add_string(const CONF *conf, CONF_SECTION *section, return 1; } -static char *eat_ws(CONF *conf, char *p) { - while (IS_WS(conf, *p) && !IS_EOF(conf, *p)) { +static char *eat_ws(char *p) { + while (*p != '\0' && is_conf_ws(*p)) { p++; } return p; } -#define scan_esc(conf, p) (((IS_EOF((conf), (p)[1])) ? ((p) + 1) : ((p) + 2))) +static char *scan_esc(char *p) { + assert(p[0] == '\\'); + return p[1] == '\0' ? 
p + 1 : p + 2; +} -static char *eat_alpha_numeric(CONF *conf, char *p) { +static char *eat_name(char *p) { for (;;) { - if (IS_ESC(conf, *p)) { - p = scan_esc(conf, p); + if (is_esc(*p)) { + p = scan_esc(p); continue; } - if (!IS_ALPHA_NUMERIC_PUNCT(conf, *p)) { + if (!is_name_char(*p)) { return p; } p++; } } -static char *scan_quote(CONF *conf, char *p) { +static char *scan_quote(char *p) { int q = *p; p++; - while (!IS_EOF(conf, *p) && *p != q) { - if (IS_ESC(conf, *p)) { + while (*p != '\0' && *p != q) { + if (is_esc(*p)) { p++; - if (IS_EOF(conf, *p)) { + if (*p == '\0') { return p; } } @@ -353,28 +378,28 @@ static char *scan_quote(CONF *conf, char *p) { return p; } -static void clear_comments(CONF *conf, char *p) { +static void clear_comments(char *p) { for (;;) { - if (!IS_WS(conf, *p)) { + if (!is_conf_ws(*p)) { break; } p++; } for (;;) { - if (IS_COMMENT(conf, *p)) { + if (is_comment(*p)) { *p = '\0'; return; } - if (IS_QUOTE(conf, *p)) { - p = scan_quote(conf, p); + if (is_quote(*p)) { + p = scan_quote(p); continue; } - if (IS_ESC(conf, *p)) { - p = scan_esc(conf, p); + if (is_esc(*p)) { + p = scan_esc(p); continue; } - if (IS_EOF(conf, *p)) { + if (*p == '\0') { return; } else { p++; @@ -454,7 +479,7 @@ int NCONF_load_bio(CONF *conf, BIO *in, long *out_error_line) { // If we have bytes and the last char '\\' and // second last char is not '\\' p = &(buff->data[bufnum - 1]); - if (IS_ESC(conf, p[0]) && ((bufnum <= 1) || !IS_ESC(conf, p[-1]))) { + if (is_esc(p[0]) && ((bufnum <= 1) || !is_esc(p[-1]))) { bufnum--; again = 1; } @@ -465,20 +490,20 @@ int NCONF_load_bio(CONF *conf, BIO *in, long *out_error_line) { bufnum = 0; buf = buff->data; - clear_comments(conf, buf); - s = eat_ws(conf, buf); - if (IS_EOF(conf, *s)) { + clear_comments(buf); + s = eat_ws(buf); + if (*s == '\0') { continue; // blank line } if (*s == '[') { char *ss; s++; - start = eat_ws(conf, s); + start = eat_ws(s); ss = start; again: - end = eat_alpha_numeric(conf, ss); - p = eat_ws(conf, end); + end = eat_name(ss); + p = eat_ws(end); if (*p != ']') { if (*p != '\0' && ss != p) { ss = p; @@ -502,27 +527,27 @@ int NCONF_load_bio(CONF *conf, BIO *in, long *out_error_line) { } else { pname = s; psection = NULL; - end = eat_alpha_numeric(conf, s); + end = eat_name(s); if ((end[0] == ':') && (end[1] == ':')) { *end = '\0'; end += 2; psection = pname; pname = end; - end = eat_alpha_numeric(conf, end); + end = eat_name(end); } - p = eat_ws(conf, end); + p = eat_ws(end); if (*p != '=') { OPENSSL_PUT_ERROR(CONF, CONF_R_MISSING_EQUAL_SIGN); goto err; } *end = '\0'; p++; - start = eat_ws(conf, p); - while (!IS_EOF(conf, *p)) { + start = eat_ws(p); + while (*p != '\0') { p++; } p--; - while ((p != start) && (IS_WS(conf, *p))) { + while (p != start && is_conf_ws(*p)) { p--; } p++; diff --git a/Sources/CCryptoBoringSSL/crypto/conf/conf_def.h b/Sources/CCryptoBoringSSL/crypto/conf/conf_def.h deleted file mode 100644 index d2c285ae..00000000 --- a/Sources/CCryptoBoringSSL/crypto/conf/conf_def.h +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -// This file was historically generated by keysets.pl in OpenSSL. -// -// TODO(davidben): Replace it with something more readable. 
- -#define CONF_NUMBER 1 -#define CONF_UPPER 2 -#define CONF_LOWER 4 -#define CONF_UNDER 256 -#define CONF_PUNCTUATION 512 -#define CONF_WS 16 -#define CONF_ESC 32 -#define CONF_QUOTE 64 -#define CONF_COMMENT 128 -#define CONF_EOF 8 -#define CONF_HIGHBIT 4096 -#define CONF_ALPHA (CONF_UPPER|CONF_LOWER) -#define CONF_ALPHA_NUMERIC (CONF_ALPHA|CONF_NUMBER|CONF_UNDER) -#define CONF_ALPHA_NUMERIC_PUNCT (CONF_ALPHA|CONF_NUMBER|CONF_UNDER| \ - CONF_PUNCTUATION) - -#define KEYTYPES(c) CONF_type_default -#define IS_COMMENT(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_COMMENT) -#define IS_EOF(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_EOF) -#define IS_ESC(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_ESC) -#define IS_NUMBER(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_NUMBER) -#define IS_WS(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_WS) -#define IS_ALPHA_NUMERIC(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_ALPHA_NUMERIC) -#define IS_ALPHA_NUMERIC_PUNCT(c,a) \ - (KEYTYPES(c)[(a)&0xff]&CONF_ALPHA_NUMERIC_PUNCT) -#define IS_QUOTE(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_QUOTE) - -static const unsigned short CONF_type_default[256]={ - 0x0008,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0010,0x0010,0x0000,0x0000,0x0010,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0010,0x0200,0x0040,0x0080,0x0000,0x0200,0x0200,0x0040, - 0x0000,0x0000,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200, - 0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001, - 0x0001,0x0001,0x0000,0x0200,0x0000,0x0000,0x0000,0x0200, - 0x0200,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 0x0002,0x0002,0x0002,0x0000,0x0020,0x0000,0x0200,0x0100, - 0x0040,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0000,0x0200,0x0000,0x0200,0x0000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - }; diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_apple.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_apple.c index b478e41b..6e90c4c3 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_apple.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_apple.c @@ -15,7 +15,7 @@ #include "internal.h" #if defined(OPENSSL_AARCH64) && defined(OPENSSL_APPLE) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) #include #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_fuchsia.c 
b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_fuchsia.c index fd36f097..36709b06 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_fuchsia.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_fuchsia.c @@ -15,7 +15,7 @@ #include "internal.h" #if defined(OPENSSL_AARCH64) && defined(OPENSSL_FUCHSIA) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) #include #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_linux.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_linux.c index 43e418b3..388a032b 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_linux.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_linux.c @@ -15,7 +15,7 @@ #include "internal.h" #if defined(OPENSSL_AARCH64) && defined(OPENSSL_LINUX) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_openbsd.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_openbsd.c index 09b19738..e44a66af 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_openbsd.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_openbsd.c @@ -15,11 +15,11 @@ #include #if defined(OPENSSL_AARCH64) && defined(OPENSSL_OPENBSD) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) -#include -#include #include +#include +#include #include @@ -27,7 +27,7 @@ void OPENSSL_cpuid_setup(void) { - int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0}; uint64_t cpu_id = 0; size_t len = sizeof(cpu_id); diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_sysreg.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_sysreg.c index 92c533a5..4d04f778 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_sysreg.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_sysreg.c @@ -18,7 +18,8 @@ // expects userspace to simply read them. It traps the reads and fills in CPU // capabilities. 
#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_STATIC_ARMCAP) && \ - (defined(ANDROID_BAREMETAL) || defined(OPENSSL_FREEBSD)) + (defined(ANDROID_BAREMETAL) || defined(OPENSSL_FREEBSD)) && \ + !defined(OPENSSL_NO_ASM) #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_win.c b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_win.c index 60e29451..3d9fc0f1 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_win.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_aarch64_win.c @@ -16,7 +16,7 @@ #include "internal.h" #if defined(OPENSSL_AARCH64) && defined(OPENSSL_WINDOWS) && \ - !defined(OPENSSL_STATIC_ARMCAP) + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) #include diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_arm_linux.c b/Sources/CCryptoBoringSSL/crypto/cpu_arm_linux.c index 44f5f2ad..2921df86 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_arm_linux.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_arm_linux.c @@ -143,6 +143,9 @@ void OPENSSL_cpuid_setup(void) { int CRYPTO_has_broken_NEON(void) { return 0; } -int CRYPTO_needs_hwcap2_workaround(void) { return g_needs_hwcap2_workaround; } +int CRYPTO_needs_hwcap2_workaround(void) { + OPENSSL_init_cpuid(); + return g_needs_hwcap2_workaround; +} #endif // OPENSSL_ARM && OPENSSL_LINUX && !OPENSSL_STATIC_ARMCAP diff --git a/Sources/CCryptoBoringSSL/crypto/cpu_intel.c b/Sources/CCryptoBoringSSL/crypto/cpu_intel.c index b34c483c..d5c7b8ba 100644 --- a/Sources/CCryptoBoringSSL/crypto/cpu_intel.c +++ b/Sources/CCryptoBoringSSL/crypto/cpu_intel.c @@ -173,20 +173,21 @@ void OPENSSL_cpuid_setup(void) { OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); - if (is_amd) { - // See https://www.amd.com/system/files/TechDocs/25481.pdf, page 10. - const uint32_t base_family = (eax >> 8) & 15; - const uint32_t base_model = (eax >> 4) & 15; - - uint32_t family = base_family; - uint32_t model = base_model; - if (base_family == 0xf) { - const uint32_t ext_family = (eax >> 20) & 255; - family += ext_family; - const uint32_t ext_model = (eax >> 16) & 15; - model |= ext_model << 4; - } + const uint32_t base_family = (eax >> 8) & 15; + const uint32_t base_model = (eax >> 4) & 15; + + uint32_t family = base_family; + uint32_t model = base_model; + if (base_family == 15) { + const uint32_t ext_family = (eax >> 20) & 255; + family += ext_family; + } + if (base_family == 6 || base_family == 15) { + const uint32_t ext_model = (eax >> 16) & 15; + model |= ext_model << 4; + } + if (is_amd) { if (family < 0x17 || (family == 0x17 && 0x70 <= model && model <= 0x7f)) { // Disable RDRAND on AMD families before 0x17 (Zen) due to reported // failures after suspend. @@ -208,15 +209,6 @@ void OPENSSL_cpuid_setup(void) { // Reserved bit #30 is repurposed to signal an Intel CPU. if (is_intel) { edx |= (1u << 30); - - // Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables - // some Silvermont-specific codepaths which perform better. See OpenSSL - // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and - // |CRYPTO_cpu_perf_is_like_silvermont|. - if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ || - (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) { - ecx &= ~(1u << 26); - } } else { edx &= ~(1u << 30); } @@ -236,25 +228,67 @@ void OPENSSL_cpuid_setup(void) { ecx &= ~(1u << 28); // AVX ecx &= ~(1u << 12); // FMA ecx &= ~(1u << 11); // AMD XOP - // Clear AVX2 and AVX512* bits. - // - // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream - // doesn't clear those. 
See the comments in - // |CRYPTO_hardware_supports_XSAVE|. - extended_features[0] &= - ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31)); + extended_features[0] &= ~(1u << 5); // AVX2 + extended_features[1] &= ~(1u << 9); // VAES + extended_features[1] &= ~(1u << 10); // VPCLMULQDQ } - // See Intel manual, volume 1, section 15.2. + // See Intel manual, volume 1, sections 15.2 ("Detection of AVX-512 Foundation + // Instructions") through 15.4 ("Detection of Intel AVX-512 Instruction Groups + // Operating at 256 and 128-bit Vector Lengths"). if ((xcr0 & 0xe6) != 0xe6) { - // Clear AVX512F. Note we don't touch other AVX512 extensions because they - // can be used with YMM. - extended_features[0] &= ~(1u << 16); + // Without XCR0.111xx11x, no AVX512 feature can be used. This includes ZMM + // registers, masking, SIMD registers 16-31 (even if accessed as YMM or + // XMM), and EVEX-coded instructions (even on YMM or XMM). Even if only + // XCR0.ZMM_Hi256 is missing, it isn't valid to use AVX512 features on + // shorter vectors, since AVX512 ties everything to the availability of + // 512-bit vectors. See the above-mentioned sections of the Intel manual, + // which say that *all* these XCR0 bits must be checked even when just using + // 128-bit or 256-bit vectors, and also volume 2a section 2.7.11 ("#UD + // Equations for EVEX") which says that all EVEX-coded instructions raise an + // undefined-instruction exception if any of these XCR0 bits is zero. + // + // AVX10 fixes this by reorganizing the features that used to be part of + // "AVX512" and allowing them to be used independently of 512-bit support. + // TODO: add AVX10 detection. + extended_features[0] &= ~(1u << 16); // AVX512F + extended_features[0] &= ~(1u << 17); // AVX512DQ + extended_features[0] &= ~(1u << 21); // AVX512IFMA + extended_features[0] &= ~(1u << 26); // AVX512PF + extended_features[0] &= ~(1u << 27); // AVX512ER + extended_features[0] &= ~(1u << 28); // AVX512CD + extended_features[0] &= ~(1u << 30); // AVX512BW + extended_features[0] &= ~(1u << 31); // AVX512VL + extended_features[1] &= ~(1u << 1); // AVX512VBMI + extended_features[1] &= ~(1u << 6); // AVX512VBMI2 + extended_features[1] &= ~(1u << 11); // AVX512VNNI + extended_features[1] &= ~(1u << 12); // AVX512BITALG + extended_features[1] &= ~(1u << 14); // AVX512VPOPCNTDQ } - // Disable ADX instructions on Knights Landing. See OpenSSL commit - // 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. - if ((ecx & (1u << 26)) == 0) { - extended_features[0] &= ~(1u << 19); + // Repurpose the bit for the removed MPX feature to indicate when using zmm + // registers should be avoided even when they are supported. (When set, AVX512 + // features can still be used, but only using ymm or xmm registers.) Skylake + // suffered from severe downclocking when zmm registers were used, which + // affected unrelated code running on the system, making zmm registers not too + // useful outside of benchmarks. The situation improved significantly by Ice + // Lake, but a small amount of downclocking remained. (See + // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/) + // We take a conservative approach of not allowing zmm registers until after + // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side. + // + // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported + // to have any downclocking problem when zmm registers are used. 
+ if (is_intel && family == 6 && + (model == 85 || // Skylake, Cascade Lake, Cooper Lake (server) + model == 106 || // Ice Lake (server) + model == 108 || // Ice Lake (micro server) + model == 125 || // Ice Lake (client) + model == 126 || // Ice Lake (mobile) + model == 140 || // Tiger Lake (mobile) + model == 141)) { // Tiger Lake (client) + extended_features[0] |= 1u << 14; + } else { + extended_features[0] &= ~(1u << 14); } OPENSSL_ia32cap_P[0] = edx; diff --git a/Sources/CCryptoBoringSSL/crypto/crypto.c b/Sources/CCryptoBoringSSL/crypto/crypto.c index ac75d723..9a4f440e 100644 --- a/Sources/CCryptoBoringSSL/crypto/crypto.c +++ b/Sources/CCryptoBoringSSL/crypto/crypto.c @@ -16,31 +16,14 @@ #include -#include "fipsmodule/rand/fork_detect.h" #include "fipsmodule/rand/internal.h" +#include "bcm_support.h" #include "internal.h" static_assert(sizeof(ossl_ssize_t) == sizeof(size_t), "ossl_ssize_t should be the same size as size_t"); -#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_STATIC_ARMCAP) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) -// x86, x86_64, and the ARMs need to record the result of a cpuid/getauxval call -// for the asm to work correctly, unless compiled without asm code. -#define NEED_CPUID - -#else - -// Otherwise, don't emit a static initialiser. - -#if !defined(BORINGSSL_NO_STATIC_INITIALIZER) -#define BORINGSSL_NO_STATIC_INITIALIZER -#endif - -#endif // !NO_ASM && !STATIC_ARMCAP && (X86 || X86_64 || ARM || AARCH64) - // Our assembly does not use the GOT to reference symbols, which means // references to visible symbols will often require a TEXTREL. This is @@ -79,7 +62,7 @@ HIDDEN uint8_t BORINGSSL_function_hit[7] = {0}; HIDDEN uint32_t OPENSSL_ia32cap_P[4] = {0}; uint32_t OPENSSL_get_ia32cap(int idx) { - CRYPTO_library_init(); + OPENSSL_init_cpuid(); return OPENSSL_ia32cap_P[idx]; } @@ -121,60 +104,24 @@ HIDDEN uint32_t OPENSSL_armcap_P = HIDDEN uint32_t OPENSSL_armcap_P = 0; uint32_t *OPENSSL_get_armcap_pointer_for_test(void) { - CRYPTO_library_init(); + OPENSSL_init_cpuid(); return &OPENSSL_armcap_P; } #endif uint32_t OPENSSL_get_armcap(void) { - CRYPTO_library_init(); + OPENSSL_init_cpuid(); return OPENSSL_armcap_P; } #endif -#if defined(BORINGSSL_FIPS) -// In FIPS mode, the power-on self-test function calls |CRYPTO_library_init| -// because we have to ensure that CPUID detection occurs first. -#define BORINGSSL_NO_STATIC_INITIALIZER -#endif - -#if defined(OPENSSL_WINDOWS) && !defined(BORINGSSL_NO_STATIC_INITIALIZER) -#define OPENSSL_CDECL __cdecl -#else -#define OPENSSL_CDECL -#endif - -#if defined(BORINGSSL_NO_STATIC_INITIALIZER) -static CRYPTO_once_t once = CRYPTO_ONCE_INIT; -#elif defined(_MSC_VER) -#pragma section(".CRT$XCU", read) -static void __cdecl do_library_init(void); -__declspec(allocate(".CRT$XCU")) void(*library_init_constructor)(void) = - do_library_init; -#else -static void do_library_init(void) __attribute__ ((constructor)); -#endif - -// do_library_init is the actual initialization function. If -// BORINGSSL_NO_STATIC_INITIALIZER isn't defined, this is set as a static -// initializer. Otherwise, it is called by CRYPTO_library_init. -static void OPENSSL_CDECL do_library_init(void) { - // WARNING: this function may only configure the capability variables. See the - // note above about the linker bug. 
#if defined(NEED_CPUID) - OPENSSL_cpuid_setup(); +static CRYPTO_once_t once = CRYPTO_ONCE_INIT; +void OPENSSL_init_cpuid(void) { CRYPTO_once(&once, OPENSSL_cpuid_setup); } #endif -} -void CRYPTO_library_init(void) { - // TODO(davidben): It would be tidier if this build knob could be replaced - // with an internal lazy-init mechanism that would handle things correctly - // in-library. https://crbug.com/542879 -#if defined(BORINGSSL_NO_STATIC_INITIALIZER) - CRYPTO_once(&once, do_library_init); -#endif -} +void CRYPTO_library_init(void) {} int CRYPTO_is_confidential_build(void) { #if defined(BORINGSSL_CONFIDENTIAL) @@ -194,7 +141,7 @@ int CRYPTO_has_asm(void) { void CRYPTO_pre_sandbox_init(void) { // Read from /proc/cpuinfo if needed. - CRYPTO_library_init(); + OPENSSL_init_cpuid(); // Open /dev/urandom if needed. CRYPTO_init_sysrand(); // Set up MADV_WIPEONFORK state if needed. @@ -235,7 +182,6 @@ int ENGINE_register_all_complete(void) { return 1; } void OPENSSL_load_builtin_modules(void) {} int OPENSSL_init_crypto(uint64_t opts, const OPENSSL_INIT_SETTINGS *settings) { - CRYPTO_library_init(); return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/curve25519/curve25519.c b/Sources/CCryptoBoringSSL/crypto/curve25519/curve25519.c index 03894479..4cf870a5 100644 --- a/Sources/CCryptoBoringSSL/crypto/curve25519/curve25519.c +++ b/Sources/CCryptoBoringSSL/crypto/curve25519/curve25519.c @@ -81,7 +81,7 @@ typedef uint64_t fe_limb_t; #define assert_fe(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ } \ } while (0) @@ -98,7 +98,7 @@ typedef uint64_t fe_limb_t; #define assert_fe_loose(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ } \ } while (0) @@ -120,8 +120,8 @@ typedef uint32_t fe_limb_t; #define assert_fe(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= \ - ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \ } \ } while (0) @@ -138,8 +138,8 @@ typedef uint32_t fe_limb_t; #define assert_fe_loose(f) \ do { \ for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= \ - ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \ } \ } while (0) @@ -150,7 +150,7 @@ static_assert(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS, static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { // |fiat_25519_from_bytes| requires the top-most bit be clear. 
- assert((s[31] & 0x80) == 0); + declassify_assert((s[31] & 0x80) == 0); fiat_25519_from_bytes(h->v, s); assert_fe(h->v); } diff --git a/Sources/CCryptoBoringSSL/crypto/digest_extra/digest_extra.c b/Sources/CCryptoBoringSSL/crypto/digest_extra/digest_extra.c index 6bc495d2..c6a9d443 100644 --- a/Sources/CCryptoBoringSSL/crypto/digest_extra/digest_extra.c +++ b/Sources/CCryptoBoringSSL/crypto/digest_extra/digest_extra.c @@ -61,6 +61,8 @@ #include #include #include +#include +#include #include #include "../asn1/internal.h" @@ -220,6 +222,7 @@ int EVP_marshal_digest_algorithm(CBB *cbb, const EVP_MD *md) { return 0; } + // TODO(crbug.com/boringssl/710): Is this correct? See RFC 4055, section 2.1. if (!CBB_add_asn1(&algorithm, &null, CBS_ASN1_NULL) || !CBB_flush(cbb)) { return 0; @@ -263,3 +266,90 @@ static const EVP_MD evp_md_blake2b256 = { }; const EVP_MD *EVP_blake2b256(void) { return &evp_md_blake2b256; } + + +static void md4_init(EVP_MD_CTX *ctx) { + BSSL_CHECK(MD4_Init(ctx->md_data)); +} + +static void md4_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BSSL_CHECK(MD4_Update(ctx->md_data, data, count)); +} + +static void md4_final(EVP_MD_CTX *ctx, uint8_t *out) { + BSSL_CHECK(MD4_Final(out, ctx->md_data)); +} + +static const EVP_MD evp_md_md4 = { + NID_md4, + MD4_DIGEST_LENGTH, + 0, + md4_init, + md4_update, + md4_final, + 64, + sizeof(MD4_CTX), +}; + +const EVP_MD *EVP_md4(void) { return &evp_md_md4; } + +static void md5_init(EVP_MD_CTX *ctx) { + BSSL_CHECK(MD5_Init(ctx->md_data)); +} + +static void md5_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BSSL_CHECK(MD5_Update(ctx->md_data, data, count)); +} + +static void md5_final(EVP_MD_CTX *ctx, uint8_t *out) { + BSSL_CHECK(MD5_Final(out, ctx->md_data)); +} + +static const EVP_MD evp_md_md5 = { + NID_md5, + MD5_DIGEST_LENGTH, + 0, + md5_init, + md5_update, + md5_final, + 64, + sizeof(MD5_CTX), +}; + +const EVP_MD *EVP_md5(void) { return &evp_md_md5; } + +typedef struct { + MD5_CTX md5; + SHA_CTX sha1; +} MD5_SHA1_CTX; + +static void md5_sha1_init(EVP_MD_CTX *md_ctx) { + MD5_SHA1_CTX *ctx = md_ctx->md_data; + BSSL_CHECK(MD5_Init(&ctx->md5) && SHA1_Init(&ctx->sha1)); +} + +static void md5_sha1_update(EVP_MD_CTX *md_ctx, const void *data, + size_t count) { + MD5_SHA1_CTX *ctx = md_ctx->md_data; + BSSL_CHECK(MD5_Update(&ctx->md5, data, count) && + SHA1_Update(&ctx->sha1, data, count)); +} + +static void md5_sha1_final(EVP_MD_CTX *md_ctx, uint8_t *out) { + MD5_SHA1_CTX *ctx = md_ctx->md_data; + BSSL_CHECK(MD5_Final(out, &ctx->md5) && + SHA1_Final(out + MD5_DIGEST_LENGTH, &ctx->sha1)); +} + +const EVP_MD evp_md_md5_sha1 = { + NID_md5_sha1, + MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH, + 0, + md5_sha1_init, + md5_sha1_update, + md5_sha1_final, + 64, + sizeof(MD5_SHA1_CTX), +}; + +const EVP_MD *EVP_md5_sha1(void) { return &evp_md_md5_sha1; } diff --git a/Sources/CCryptoBoringSSL/crypto/dilithium/dilithium.c b/Sources/CCryptoBoringSSL/crypto/dilithium/dilithium.c new file mode 100644 index 00000000..770ed765 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/dilithium/dilithium.c @@ -0,0 +1,1539 @@ +/* Copyright (c) 2023, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#define OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM +#include + +#include +#include + +#include +#include + +#include "../internal.h" +#include "../keccak/internal.h" +#include "./internal.h" + +#define DEGREE 256 +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define OMEGA 55 + +#define RHO_BYTES 32 +#define SIGMA_BYTES 64 +#define K_BYTES 32 +#define TR_BYTES 64 +#define MU_BYTES 64 +#define RHO_PRIME_BYTES 64 +#define LAMBDA_BITS 192 +#define LAMBDA_BYTES (LAMBDA_BITS / 8) + +// 2^23 - 2^13 + 1 +static const uint32_t kPrime = 8380417; +// Inverse of -kPrime modulo 2^32 +static const uint32_t kPrimeNegInverse = 4236238847; +static const int kDroppedBits = 13; +static const uint32_t kHalfPrime = (8380417 - 1) / 2; +static const uint32_t kGamma1 = 1 << 19; +static const uint32_t kGamma2 = (8380417 - 1) / 32; +// 256^-1 mod kPrime, in Montgomery form. +static const uint32_t kInverseDegreeMontgomery = 41978; + +typedef struct scalar { + uint32_t c[DEGREE]; +} scalar; + +typedef struct vectork { + scalar v[K]; +} vectork; + +typedef struct vectorl { + scalar v[L]; +} vectorl; + +typedef struct matrix { + scalar v[K][L]; +} matrix; + +/* Arithmetic */ + +// This bit of Python will be referenced in some of the following comments: +// +// q = 8380417 +// # Inverse of -q modulo 2^32 +// q_neg_inverse = 4236238847 +// # 2^64 modulo q +// montgomery_square = 2365951 +// +// def bitreverse(i): +// ret = 0 +// for n in range(8): +// bit = i & 1 +// ret <<= 1 +// ret |= bit +// i >>= 1 +// return ret +// +// def montgomery_reduce(x): +// a = (x * q_neg_inverse) % 2**32 +// b = x + a * q +// assert b & 0xFFFF_FFFF == 0 +// c = b >> 32 +// assert c < q +// return c +// +// def montgomery_transform(x): +// return montgomery_reduce(x * montgomery_square) + +// kNTTRootsMontgomery = [ +// montgomery_transform(pow(1753, bitreverse(i), q)) for i in range(256) +// ] +static const uint32_t kNTTRootsMontgomery[256] = { + 4193792, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, + 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, + 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, + 6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, + 4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, + 6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, + 811944, 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, + 4450022, 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, + 7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, + 3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, + 7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, + 5037034, 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, + 4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969, 7063561, + 189548, 4827145, 3159746, 6529015, 5971092, 8202977, 
1315589, 1341330, + 1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961, + 2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955, + 266997, 2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039, + 900702, 1859098, 909542, 819034, 495491, 6767243, 8337157, 7857917, + 7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579, + 342297, 286988, 5942594, 4108315, 3437287, 5038140, 1735879, 203044, + 2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974, + 4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447, + 7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775, + 7100756, 1917081, 5834105, 7005614, 1500165, 777191, 2235880, 3406031, + 7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136, 4603424, + 6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531, 7173032, + 5196991, 162844, 1616392, 3014001, 810149, 1652634, 4686184, 6581310, + 5341501, 3523897, 3866901, 269760, 2213111, 7404533, 1717735, 472078, + 7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524, + 5441381, 6144432, 7959518, 6094090, 183443, 7403526, 1612842, 4834730, + 7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782}; + +// Reduces x mod kPrime in constant time, where 0 <= x < 2*kPrime. +static uint32_t reduce_once(uint32_t x) { + declassify_assert(x < 2 * kPrime); + // return x < kPrime ? x : x - kPrime; + return constant_time_select_int(constant_time_lt_w(x, kPrime), x, x - kPrime); +} + +// Returns the absolute value in constant time. +static uint32_t abs_signed(uint32_t x) { + // return is_positive(x) ? x : -x; + // Note: MSVC doesn't like applying the unary minus operator to unsigned types + // (warning C4146), so we write the negation as a bitwise not plus one + // (assuming two's complement representation). + return constant_time_select_int(constant_time_lt_w(x, 0x80000000), x, ~x + 1); +} + +// Returns the absolute value modulo kPrime. +static uint32_t abs_mod_prime(uint32_t x) { + declassify_assert(x < kPrime); + // return x > kHalfPrime ? kPrime - x : x; + return constant_time_select_int(constant_time_lt_w(kHalfPrime, x), kPrime - x, + x); +} + +// Returns the maximum of two values in constant time. +static uint32_t maximum(uint32_t x, uint32_t y) { + // return x < y ? y : x; + return constant_time_select_int(constant_time_lt_w(x, y), y, x); +} + +static void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_once(lhs->c[i] + rhs->c[i]); + } +} + +static void scalar_sub(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_once(kPrime + lhs->c[i] - rhs->c[i]); + } +} + +static uint32_t reduce_montgomery(uint64_t x) { + uint64_t a = (uint32_t)x * kPrimeNegInverse; + uint64_t b = x + a * kPrime; + declassify_assert((b & 0xffffffff) == 0); + uint32_t c = b >> 32; + return reduce_once(c); +} + +// Multiply two scalars in the number theoretically transformed state. +static void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_montgomery((uint64_t)lhs->c[i] * (uint64_t)rhs->c[i]); + } +} + +// In place number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 35 (`NTT`). 
+static void scalar_ntt(scalar *s) { + // Step: 1, 2, 4, 8, ..., 128 + // Offset: 128, 64, 32, 16, ..., 1 + int offset = DEGREE; + for (int step = 1; step < DEGREE; step <<= 1) { + offset >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = kNTTRootsMontgomery[step + i]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + uint32_t odd = + reduce_montgomery((uint64_t)step_root * (uint64_t)s->c[j + offset]); + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = reduce_once(kPrime + even - odd); + } + k += 2 * offset; + } + } +} + +// In place inverse number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 36 (`NTT^-1`). +static void scalar_inverse_ntt(scalar *s) { + // Step: 128, 64, 32, 16, ..., 1 + // Offset: 1, 2, 4, 8, ..., 128 + int step = DEGREE; + for (int offset = 1; offset < DEGREE; offset <<= 1) { + step >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = + kPrime - kNTTRootsMontgomery[step + (step - 1 - i)]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + uint32_t odd = s->c[j + offset]; + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = reduce_montgomery((uint64_t)step_root * + (uint64_t)(kPrime + even - odd)); + } + k += 2 * offset; + } + } + for (int i = 0; i < DEGREE; i++) { + s->c[i] = reduce_montgomery((uint64_t)s->c[i] * + (uint64_t)kInverseDegreeMontgomery); + } +} + +static void vectork_zero(vectork *out) { OPENSSL_memset(out, 0, sizeof(*out)); } + +static void vectork_add(vectork *out, const vectork *lhs, const vectork *rhs) { + for (int i = 0; i < K; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectork_sub(vectork *out, const vectork *lhs, const vectork *rhs) { + for (int i = 0; i < K; i++) { + scalar_sub(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectork_mult_scalar(vectork *out, const vectork *lhs, + const scalar *rhs) { + for (int i = 0; i < K; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +static void vectork_ntt(vectork *a) { + for (int i = 0; i < K; i++) { + scalar_ntt(&a->v[i]); + } +} + +static void vectork_inverse_ntt(vectork *a) { + for (int i = 0; i < K; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void vectorl_add(vectorl *out, const vectorl *lhs, const vectorl *rhs) { + for (int i = 0; i < L; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectorl_mult_scalar(vectorl *out, const vectorl *lhs, + const scalar *rhs) { + for (int i = 0; i < L; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +static void vectorl_ntt(vectorl *a) { + for (int i = 0; i < L; i++) { + scalar_ntt(&a->v[i]); + } +} + +static void vectorl_inverse_ntt(vectorl *a) { + for (int i = 0; i < L; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void matrix_mult(vectork *out, const matrix *m, const vectorl *a) { + vectork_zero(out); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + scalar product; + scalar_mult(&product, &m->v[i][j], &a->v[j]); + scalar_add(&out->v[i], &out->v[i], &product); + } + } +} + +/* Rounding & hints */ + +// FIPS 204, Algorithm 29 (`Power2Round`). +static void power2_round(uint32_t *r1, uint32_t *r0, uint32_t r) { + *r1 = r >> kDroppedBits; + *r0 = r - (*r1 << kDroppedBits); + + uint32_t r0_adjusted = reduce_once(kPrime + *r0 - (1 << kDroppedBits)); + uint32_t r1_adjusted = *r1 + 1; + + // Mask is set iff r0 > 2^(dropped_bits - 1). 
+ crypto_word_t mask = + constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0); + // r0 = mask ? r0_adjusted : r0 + *r0 = constant_time_select_int(mask, r0_adjusted, *r0); + // r1 = mask ? r1_adjusted : r1 + *r1 = constant_time_select_int(mask, r1_adjusted, *r1); +} + +// Scale back previously rounded value. +static void scale_power2_round(uint32_t *out, uint32_t r1) { + // Pre-condition: 0 <= r1 <= 2^10 - 1 + *out = r1 << kDroppedBits; + // Post-condition: 0 <= out <= 2^23 - 2^13 = kPrime - 1 + assert(*out < kPrime); +} + +// FIPS 204, Algorithm 31 (`HighBits`). +static uint32_t high_bits(uint32_t x) { + // Reference description (given 0 <= x < q): + // + // ``` + // int32_t r0 = x mod+- (2 * kGamma2); + // if (x - r0 == q - 1) { + // return 0; + // } else { + // return (x - r0) / (2 * kGamma2); + // } + // ``` + // + // Below is the formula taken from the reference implementation. + // + // Here, kGamma2 == 2^18 - 2^8 + // This returns ((ceil(x / 2^7) * (2^10 + 1) + 2^21) / 2^22) mod 2^4 + uint32_t r1 = (x + 127) >> 7; + r1 = (r1 * 1025 + (1 << 21)) >> 22; + r1 &= 15; + return r1; +} + +// FIPS 204, Algorithm 30 (`Decompose`). +static void decompose(uint32_t *r1, int32_t *r0, uint32_t r) { + *r1 = high_bits(r); + + *r0 = r; + *r0 -= *r1 * 2 * (int32_t)kGamma2; + *r0 -= (((int32_t)kHalfPrime - *r0) >> 31) & (int32_t)kPrime; +} + +// FIPS 204, Algorithm 32 (`LowBits`). +static int32_t low_bits(uint32_t x) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, x); + return r0; +} + +// FIPS 204, Algorithm 33 (`MakeHint`). +static int32_t make_hint(uint32_t ct0, uint32_t cs2, uint32_t w) { + uint32_t r_plus_z = reduce_once(kPrime + w - cs2); + uint32_t r = reduce_once(r_plus_z + ct0); + return high_bits(r) != high_bits(r_plus_z); +} + +// FIPS 204, Algorithm 34 (`UseHint`). 
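+//
+// Given a hint bit produced by |make_hint|, this recovers
+// high_bits(w - cs2) from r = w - cs2 + ct0 without knowledge of |cs2| or
+// |ct0| (provided |ct0| stays below kGamma2, which the signer enforces before
+// emitting hints). That is what lets the verifier, which can only compute
+// A*z - c*t1*2^d = w - c*s2 + c*t0, reconstruct the same w1 the signer
+// committed to.
+//
+// As a worked example of |decompose| with kGamma2 = 261888: for r = 1000000,
+// r1 = 2 and r0 = -47552, since 1000000 = 2*(2*261888) - 47552 and
+// |r0| <= kGamma2.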
+static uint32_t use_hint_vartime(uint32_t h, uint32_t r) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, r); + + if (h) { + if (r0 > 0) { + return (r1 + 1) & 15; + } else { + return (r1 - 1) & 15; + } + } else { + return r1; + } +} + +static void scalar_power2_round(scalar *s1, scalar *s0, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + power2_round(&s1->c[i], &s0->c[i], s->c[i]); + } +} + +static void scalar_scale_power2_round(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + scale_power2_round(&out->c[i], in->c[i]); + } +} + +static void scalar_high_bits(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = high_bits(in->c[i]); + } +} + +static void scalar_low_bits(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = low_bits(in->c[i]); + } +} + +static void scalar_max(uint32_t *max, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + uint32_t abs = abs_mod_prime(s->c[i]); + *max = maximum(*max, abs); + } +} + +static void scalar_max_signed(uint32_t *max, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + uint32_t abs = abs_signed(s->c[i]); + *max = maximum(*max, abs); + } +} + +static void scalar_make_hint(scalar *out, const scalar *ct0, const scalar *cs2, + const scalar *w) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = make_hint(ct0->c[i], cs2->c[i], w->c[i]); + } +} + +static void scalar_use_hint_vartime(scalar *out, const scalar *h, + const scalar *r) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = use_hint_vartime(h->c[i], r->c[i]); + } +} + +static void vectork_power2_round(vectork *t1, vectork *t0, const vectork *t) { + for (int i = 0; i < K; i++) { + scalar_power2_round(&t1->v[i], &t0->v[i], &t->v[i]); + } +} + +static void vectork_scale_power2_round(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_scale_power2_round(&out->v[i], &in->v[i]); + } +} + +static void vectork_high_bits(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_high_bits(&out->v[i], &in->v[i]); + } +} + +static void vectork_low_bits(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_low_bits(&out->v[i], &in->v[i]); + } +} + +static uint32_t vectork_max(const vectork *a) { + uint32_t max = 0; + for (int i = 0; i < K; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +static uint32_t vectork_max_signed(const vectork *a) { + uint32_t max = 0; + for (int i = 0; i < K; i++) { + scalar_max_signed(&max, &a->v[i]); + } + return max; +} + +// The input vector contains only zeroes and ones. +static size_t vectork_count_ones(const vectork *a) { + size_t count = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < DEGREE; j++) { + count += a->v[i].c[j]; + } + } + return count; +} + +static void vectork_make_hint(vectork *out, const vectork *ct0, + const vectork *cs2, const vectork *w) { + for (int i = 0; i < K; i++) { + scalar_make_hint(&out->v[i], &ct0->v[i], &cs2->v[i], &w->v[i]); + } +} + +static void vectork_use_hint_vartime(vectork *out, const vectork *h, + const vectork *r) { + for (int i = 0; i < K; i++) { + scalar_use_hint_vartime(&out->v[i], &h->v[i], &r->v[i]); + } +} + +static uint32_t vectorl_max(const vectorl *a) { + uint32_t max = 0; + for (int i = 0; i < L; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +/* Bit packing */ + +static const uint8_t kMasks[8] = {0x01, 0x03, 0x07, 0x0f, + 0x1f, 0x3f, 0x7f, 0xff}; + +// FIPS 204, Algorithm 10 (`SimpleBitPack`). 
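+//
+// Packs the 256 coefficients little-endian, |bits| bits each, into 32*|bits|
+// bytes. For example, with |bits| = 10 the first coefficients c0 = 0x155,
+// c1 = 0x2AA, c2, ... encode as
+//
+//   out[0] = c0 & 0xff                       = 0x55
+//   out[1] = (c0 >> 8) | ((c1 & 0x3f) << 2)  = 0xa9
+//   out[2] = (c1 >> 6) | ((c2 & 0x0f) << 4)
+//
+// so each scalar occupies a whole number of bytes (320 bytes at 10 bits).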
+static void scalar_encode(uint8_t *out, const scalar *s, int bits) { + assert(bits <= (int)sizeof(*s->c) * 8 && bits != 1); + + uint8_t out_byte = 0; + int out_byte_bits = 0; + + for (int i = 0; i < DEGREE; i++) { + uint32_t element = s->c[i]; + int element_bits_done = 0; + + while (element_bits_done < bits) { + int chunk_bits = bits - element_bits_done; + int out_bits_remaining = 8 - out_byte_bits; + if (chunk_bits >= out_bits_remaining) { + chunk_bits = out_bits_remaining; + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + *out = out_byte; + out++; + out_byte_bits = 0; + out_byte = 0; + } else { + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + out_byte_bits += chunk_bits; + } + + element_bits_done += chunk_bits; + element >>= chunk_bits; + } + } + + if (out_byte_bits > 0) { + *out = out_byte; + } +} + +// FIPS 204, Algorithm 11 (`BitPack`). +static void scalar_encode_signed(uint8_t *out, const scalar *s, int bits, + uint32_t max) { + assert(bits <= (int)sizeof(*s->c) * 8 && bits != 1); + + uint8_t out_byte = 0; + int out_byte_bits = 0; + + for (int i = 0; i < DEGREE; i++) { + uint32_t element = reduce_once(kPrime + max - s->c[i]); + declassify_assert(element <= 2 * max); + int element_bits_done = 0; + + while (element_bits_done < bits) { + int chunk_bits = bits - element_bits_done; + int out_bits_remaining = 8 - out_byte_bits; + if (chunk_bits >= out_bits_remaining) { + chunk_bits = out_bits_remaining; + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + *out = out_byte; + out++; + out_byte_bits = 0; + out_byte = 0; + } else { + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + out_byte_bits += chunk_bits; + } + + element_bits_done += chunk_bits; + element >>= chunk_bits; + } + } + + if (out_byte_bits > 0) { + *out = out_byte; + } +} + +// FIPS 204, Algorithm 12 (`SimpleBitUnpack`). +static void scalar_decode(scalar *out, const uint8_t *in, int bits) { + assert(bits <= (int)sizeof(*out->c) * 8 && bits != 1); + + uint8_t in_byte = 0; + int in_byte_bits_left = 0; + + for (int i = 0; i < DEGREE; i++) { + uint32_t element = 0; + int element_bits_done = 0; + + while (element_bits_done < bits) { + if (in_byte_bits_left == 0) { + in_byte = *in; + in++; + in_byte_bits_left = 8; + } + + int chunk_bits = bits - element_bits_done; + if (chunk_bits > in_byte_bits_left) { + chunk_bits = in_byte_bits_left; + } + + element |= (in_byte & kMasks[chunk_bits - 1]) << element_bits_done; + in_byte_bits_left -= chunk_bits; + in_byte >>= chunk_bits; + + element_bits_done += chunk_bits; + } + + out->c[i] = element; + } +} + +// FIPS 204, Algorithm 13 (`BitUnpack`). +static int scalar_decode_signed(scalar *out, const uint8_t *in, int bits, + uint32_t max) { + assert(bits <= (int)sizeof(*out->c) * 8 && bits != 1); + + uint8_t in_byte = 0; + int in_byte_bits_left = 0; + + for (int i = 0; i < DEGREE; i++) { + uint32_t element = 0; + int element_bits_done = 0; + + while (element_bits_done < bits) { + if (in_byte_bits_left == 0) { + in_byte = *in; + in++; + in_byte_bits_left = 8; + } + + int chunk_bits = bits - element_bits_done; + if (chunk_bits > in_byte_bits_left) { + chunk_bits = in_byte_bits_left; + } + + element |= (in_byte & kMasks[chunk_bits - 1]) << element_bits_done; + in_byte_bits_left -= chunk_bits; + in_byte >>= chunk_bits; + + element_bits_done += chunk_bits; + } + + // This may be only out of range in cases of invalid input, in which case it + // is okay to leak the value. 
This function is also called with secret + // input during signing, in |scalar_sample_mask|. However, in that case + // (and in any case when |max| is a power of two), this case is impossible. + if (constant_time_declassify_int(element > 2 * max)) { + return 0; + } + out->c[i] = reduce_once(kPrime + max - element); + } + + return 1; +} + +/* Expansion functions */ + +// FIPS 204, Algorithm 24 (`RejNTTPoly`). +// +// Rejection samples a Keccak stream to get uniformly distributed elements. This +// is used for matrix expansion and only operates on public inputs. +static void scalar_from_keccak_vartime( + scalar *out, const uint8_t derived_seed[RHO_BYTES + 2]) { + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, RHO_BYTES + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 168); + static_assert(168 % 3 == 0, "block and coefficient boundaries do not align"); + + int done = 0; + while (done < DEGREE) { + uint8_t block[168]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; i += 3) { + // FIPS 204, Algorithm 8 (`CoeffFromThreeBytes`). + uint32_t value = (uint32_t)block[i] | ((uint32_t)block[i + 1] << 8) | + (((uint32_t)block[i + 2] & 0x7f) << 16); + if (value < kPrime) { + out->c[done++] = value; + } + } + } +} + +// FIPS 204, Algorithm 25 (`RejBoundedPoly`). +static void scalar_uniform_eta_4(scalar *out, + const uint8_t derived_seed[SIGMA_BYTES + 2]) { + static_assert(ETA == 4, "This implementation is specialized for ETA == 4"); + + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, SIGMA_BYTES + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + int done = 0; + while (done < DEGREE) { + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; ++i) { + uint32_t t0 = block[i] & 0x0F; + uint32_t t1 = block[i] >> 4; + // FIPS 204, Algorithm 9 (`CoefFromHalfByte`). Although both the input and + // output here are secret, it is OK to leak when we rejected a byte. + // Individual bytes of the SHAKE-256 stream are (indistiguishable from) + // independent of each other and the original seed, so leaking information + // about the rejected bytes does not reveal the input or output. + if (constant_time_declassify_int(t0 < 9)) { + out->c[done++] = reduce_once(kPrime + ETA - t0); + } + if (done < DEGREE && constant_time_declassify_int(t1 < 9)) { + out->c[done++] = reduce_once(kPrime + ETA - t1); + } + } + } +} + +// FIPS 204, Algorithm 28 (`ExpandMask`). +static void scalar_sample_mask( + scalar *out, const uint8_t derived_seed[RHO_PRIME_BYTES + 2]) { + uint8_t buf[640]; + BORINGSSL_keccak(buf, sizeof(buf), derived_seed, RHO_PRIME_BYTES + 2, + boringssl_shake256); + + // Note: Decoding 20 bits into (-2^19, 2^19] cannot fail. + scalar_decode_signed(out, buf, 20, 1 << 19); +} + +// FIPS 204, Algorithm 23 (`SampleInBall`). 
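+//
+// The challenge polynomial produced here has exactly TAU (49) coefficients
+// equal to +1 or -1 and all other coefficients zero. The loop is an in-place
+// Fisher-Yates shuffle: at step i a uniform index |byte| <= i is drawn, the
+// value currently at |byte| moves to position i, and position |byte| receives
+// a fresh +-1, so after the final step exactly TAU entries are non-zero. The
+// small weight of c keeps c*s1 and c*s2 small (infinity norm at most
+// BETA = TAU * ETA = 196), which the rejection bounds below rely on.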
+static void scalar_sample_in_ball_vartime(scalar *out, const uint8_t *seed, + int len) { + assert(len == 32); + + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, seed, len); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + + uint64_t signs = CRYPTO_load_u64_le(block); + int offset = 8; + // SampleInBall implements a Fisher–Yates shuffle, which unavoidably leaks + // where the zeros are by memory access pattern. Although this leak happens + // before bad signatures are rejected, this is safe. See + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/8d8f01ac_70af3f21/ + CONSTTIME_DECLASSIFY(block + offset, sizeof(block) - offset); + + OPENSSL_memset(out, 0, sizeof(*out)); + for (size_t i = DEGREE - TAU; i < DEGREE; i++) { + size_t byte; + for (;;) { + if (offset == 136) { + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + // See above. + CONSTTIME_DECLASSIFY(block, sizeof(block)); + offset = 0; + } + + byte = block[offset++]; + if (byte <= i) { + break; + } + } + + out->c[i] = out->c[byte]; + out->c[byte] = reduce_once(kPrime + 1 - 2 * (signs & 1)); + signs >>= 1; + } +} + +// FIPS 204, Algorithm 26 (`ExpandA`). +static void matrix_expand(matrix *out, const uint8_t rho[RHO_BYTES]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + + uint8_t derived_seed[RHO_BYTES + 2]; + OPENSSL_memcpy(derived_seed, rho, RHO_BYTES); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + derived_seed[RHO_BYTES + 1] = i; + derived_seed[RHO_BYTES] = j; + scalar_from_keccak_vartime(&out->v[i][j], derived_seed); + } + } +} + +// FIPS 204, Algorithm 27 (`ExpandS`). +static void vector_expand_short(vectorl *s1, vectork *s2, + const uint8_t sigma[SIGMA_BYTES]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + static_assert(K + L <= 0x100, "K+L must fit in 8 bits"); + + uint8_t derived_seed[SIGMA_BYTES + 2]; + OPENSSL_memcpy(derived_seed, sigma, SIGMA_BYTES); + derived_seed[SIGMA_BYTES] = 0; + derived_seed[SIGMA_BYTES + 1] = 0; + for (int i = 0; i < L; i++) { + scalar_uniform_eta_4(&s1->v[i], derived_seed); + ++derived_seed[SIGMA_BYTES]; + } + for (int i = 0; i < K; i++) { + scalar_uniform_eta_4(&s2->v[i], derived_seed); + ++derived_seed[SIGMA_BYTES]; + } +} + +// FIPS 204, Algorithm 28 (`ExpandMask`). +static void vectorl_expand_mask(vectorl *out, + const uint8_t seed[RHO_PRIME_BYTES], + size_t kappa) { + assert(kappa + L <= 0x10000); + + uint8_t derived_seed[RHO_PRIME_BYTES + 2]; + OPENSSL_memcpy(derived_seed, seed, RHO_PRIME_BYTES); + for (int i = 0; i < L; i++) { + size_t index = kappa + i; + derived_seed[RHO_PRIME_BYTES] = index & 0xFF; + derived_seed[RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF; + scalar_sample_mask(&out->v[i], derived_seed); + } +} + +/* Encoding */ + +// FIPS 204, Algorithm 10 (`SimpleBitPack`). +// +// Encodes an entire vector into 32*K*|bits| bytes. Note that since 256 (DEGREE) +// is divisible by 8, the individual vector entries will always fill a whole +// number of bytes, so we do not need to worry about bit packing here. 
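+//
+// For example, |bits| = 10 (the t1 encoding) gives 320 bytes per scalar and
+// 320*K bytes per vector, and |bits| = 4 (the w1 encoding at this parameter
+// set) gives 128 bytes per scalar and 128*K bytes per vector, matching the
+// buffer sizes used by the marshalling code and |w1_encode| below.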
+static void vectork_encode(uint8_t *out, const vectork *a, int bits) { + for (int i = 0; i < K; i++) { + scalar_encode(out + i * bits * DEGREE / 8, &a->v[i], bits); + } +} + +// FIPS 204, Algorithm 12 (`SimpleBitUnpack`). +static void vectork_decode(vectork *out, const uint8_t *in, int bits) { + for (int i = 0; i < K; i++) { + scalar_decode(&out->v[i], in + i * bits * DEGREE / 8, bits); + } +} + +static void vectork_encode_signed(uint8_t *out, const vectork *a, int bits, + uint32_t max) { + for (int i = 0; i < K; i++) { + scalar_encode_signed(out + i * bits * DEGREE / 8, &a->v[i], bits, max); + } +} + +static int vectork_decode_signed(vectork *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < K; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * DEGREE / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 11 (`BitPack`). +// +// Encodes an entire vector into 32*L*|bits| bytes. Note that since 256 (DEGREE) +// is divisible by 8, the individual vector entries will always fill a whole +// number of bytes, so we do not need to worry about bit packing here. +static void vectorl_encode_signed(uint8_t *out, const vectorl *a, int bits, + uint32_t max) { + for (int i = 0; i < L; i++) { + scalar_encode_signed(out + i * bits * DEGREE / 8, &a->v[i], bits, max); + } +} + +static int vectorl_decode_signed(vectorl *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < L; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * DEGREE / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 22 (`w1Encode`). +// +// The output must point to an array of 128*K bytes. +static void w1_encode(uint8_t *out, const vectork *w1) { + vectork_encode(out, w1, 4); +} + +// FIPS 204, Algorithm 14 (`HintBitPack`). +static void hint_bit_pack(uint8_t *out, const vectork *h) { + OPENSSL_memset(out, 0, OMEGA + K); + int index = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < DEGREE; j++) { + if (h->v[i].c[j]) { + out[index++] = j; + } + } + out[OMEGA + i] = index; + } +} + +// FIPS 204, Algorithm 15 (`HintBitUnpack`). +static int hint_bit_unpack(vectork *h, const uint8_t *in) { + vectork_zero(h); + int index = 0; + for (int i = 0; i < K; i++) { + int limit = in[OMEGA + i]; + if (limit < index || limit > OMEGA) { + return 0; + } + + int last = -1; + while (index < limit) { + int byte = in[index++]; + if (last >= 0 && byte <= last) { + return 0; + } + last = byte; + h->v[i].c[byte] = 1; + } + } + for (; index < OMEGA; index++) { + if (in[index] != 0) { + return 0; + } + } + return 1; +} + +struct public_key { + uint8_t rho[RHO_BYTES]; + vectork t1; + // Pre-cached value(s). + uint8_t public_key_hash[TR_BYTES]; +}; + +struct private_key { + uint8_t rho[RHO_BYTES]; + uint8_t k[K_BYTES]; + uint8_t public_key_hash[TR_BYTES]; + vectorl s1; + vectork s2; + vectork t0; +}; + +struct signature { + uint8_t c_tilde[2 * LAMBDA_BYTES]; + vectorl z; + vectork h; +}; + +// FIPS 204, Algorithm 16 (`pkEncode`). +static int dilithium_marshal_public_key(CBB *out, + const struct public_key *pub) { + if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho))) { + return 0; + } + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 320 * K)) { + return 0; + } + vectork_encode(vectork_output, &pub->t1, 10); + + return 1; +} + +// FIPS 204, Algorithm 17 (`pkDecode`). 
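+//
+// With this parameter set the encodings handled here have fixed sizes: the
+// public key is RHO_BYTES + 320*K = 1952 bytes (DILITHIUM_PUBLIC_KEY_BYTES),
+// the private key is 32 + 32 + 64 + 128*L + 128*K + 416*K = 4032 bytes, and
+// the signature is 2*LAMBDA_BYTES + 640*L + OMEGA + K = 3309 bytes
+// (DILITHIUM_SIGNATURE_BYTES).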
+static int dilithium_parse_public_key(struct public_key *pub, CBS *in) { + if (!CBS_copy_bytes(in, pub->rho, sizeof(pub->rho))) { + return 0; + } + + CBS t1_bytes; + if (!CBS_get_bytes(in, &t1_bytes, 320 * K)) { + return 0; + } + vectork_decode(&pub->t1, CBS_data(&t1_bytes), 10); + + return 1; +} + +// FIPS 204, Algorithm 18 (`skEncode`). +static int dilithium_marshal_private_key(CBB *out, + const struct private_key *priv) { + if (!CBB_add_bytes(out, priv->rho, sizeof(priv->rho)) || + !CBB_add_bytes(out, priv->k, sizeof(priv->k)) || + !CBB_add_bytes(out, priv->public_key_hash, + sizeof(priv->public_key_hash))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, 128 * L)) { + return 0; + } + vectorl_encode_signed(vectorl_output, &priv->s1, 4, ETA); + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 128 * K)) { + return 0; + } + vectork_encode_signed(vectork_output, &priv->s2, 4, ETA); + + if (!CBB_add_space(out, &vectork_output, 416 * K)) { + return 0; + } + vectork_encode_signed(vectork_output, &priv->t0, 13, 1 << 12); + + return 1; +} + +// FIPS 204, Algorithm 19 (`skDecode`). +static int dilithium_parse_private_key(struct private_key *priv, CBS *in) { + CBS s1_bytes; + CBS s2_bytes; + CBS t0_bytes; + if (!CBS_copy_bytes(in, priv->rho, sizeof(priv->rho)) || + !CBS_copy_bytes(in, priv->k, sizeof(priv->k)) || + !CBS_copy_bytes(in, priv->public_key_hash, + sizeof(priv->public_key_hash)) || + !CBS_get_bytes(in, &s1_bytes, 128 * L) || + !vectorl_decode_signed(&priv->s1, CBS_data(&s1_bytes), 4, ETA) || + !CBS_get_bytes(in, &s2_bytes, 128 * K) || + !vectork_decode_signed(&priv->s2, CBS_data(&s2_bytes), 4, ETA) || + !CBS_get_bytes(in, &t0_bytes, 416 * K) || + // Note: Decoding 13 bits into (-2^12, 2^12] cannot fail. + !vectork_decode_signed(&priv->t0, CBS_data(&t0_bytes), 13, 1 << 12)) { + return 0; + } + + return 1; +} + +// FIPS 204, Algorithm 20 (`sigEncode`). +static int dilithium_marshal_signature(CBB *out, const struct signature *sign) { + if (!CBB_add_bytes(out, sign->c_tilde, sizeof(sign->c_tilde))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, 640 * L)) { + return 0; + } + vectorl_encode_signed(vectorl_output, &sign->z, 20, 1 << 19); + + uint8_t *hint_output; + if (!CBB_add_space(out, &hint_output, OMEGA + K)) { + return 0; + } + hint_bit_pack(hint_output, &sign->h); + + return 1; +} + +// FIPS 204, Algorithm 21 (`sigDecode`). +static int dilithium_parse_signature(struct signature *sign, CBS *in) { + CBS z_bytes; + CBS hint_bytes; + if (!CBS_copy_bytes(in, sign->c_tilde, sizeof(sign->c_tilde)) || + !CBS_get_bytes(in, &z_bytes, 640 * L) || + // Note: Decoding 20 bits into (-2^19, 2^19] cannot fail. 
+ !vectorl_decode_signed(&sign->z, CBS_data(&z_bytes), 20, 1 << 19) || + !CBS_get_bytes(in, &hint_bytes, OMEGA + K) || + !hint_bit_unpack(&sign->h, CBS_data(&hint_bytes))) { + return 0; + }; + + return 1; +} + +static struct private_key *private_key_from_external( + const struct DILITHIUM_private_key *external) { + static_assert( + sizeof(struct DILITHIUM_private_key) == sizeof(struct private_key), + "Kyber private key size incorrect"); + static_assert( + alignof(struct DILITHIUM_private_key) == alignof(struct private_key), + "Kyber private key align incorrect"); + return (struct private_key *)external; +} + +static struct public_key *public_key_from_external( + const struct DILITHIUM_public_key *external) { + static_assert( + sizeof(struct DILITHIUM_public_key) == sizeof(struct public_key), + "Dilithium public key size incorrect"); + static_assert( + alignof(struct DILITHIUM_public_key) == alignof(struct public_key), + "Dilithium public key align incorrect"); + return (struct public_key *)external; +} + +/* API */ + +// Calls |DILITHIUM_generate_key_external_entropy| with random bytes from +// |RAND_bytes|. Returns 1 on success and 0 on failure. +int DILITHIUM_generate_key( + uint8_t out_encoded_public_key[DILITHIUM_PUBLIC_KEY_BYTES], + struct DILITHIUM_private_key *out_private_key) { + uint8_t entropy[DILITHIUM_GENERATE_KEY_ENTROPY]; + RAND_bytes(entropy, sizeof(entropy)); + return DILITHIUM_generate_key_external_entropy(out_encoded_public_key, + out_private_key, entropy); +} + +// FIPS 204, Algorithm 1 (`ML-DSA.KeyGen`). Returns 1 on success and 0 on +// failure. +int DILITHIUM_generate_key_external_entropy( + uint8_t out_encoded_public_key[DILITHIUM_PUBLIC_KEY_BYTES], + struct DILITHIUM_private_key *out_private_key, + const uint8_t entropy[DILITHIUM_GENERATE_KEY_ENTROPY]) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + struct public_key pub; + matrix a_ntt; + vectorl s1_ntt; + vectork t; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + struct private_key *priv = private_key_from_external(out_private_key); + + uint8_t expanded_seed[RHO_BYTES + SIGMA_BYTES + K_BYTES]; + BORINGSSL_keccak(expanded_seed, sizeof(expanded_seed), entropy, + DILITHIUM_GENERATE_KEY_ENTROPY, boringssl_shake256); + const uint8_t *const rho = expanded_seed; + const uint8_t *const sigma = expanded_seed + RHO_BYTES; + const uint8_t *const k = expanded_seed + RHO_BYTES + SIGMA_BYTES; + // rho is public. + CONSTTIME_DECLASSIFY(rho, RHO_BYTES); + OPENSSL_memcpy(values->pub.rho, rho, sizeof(values->pub.rho)); + OPENSSL_memcpy(priv->rho, rho, sizeof(priv->rho)); + OPENSSL_memcpy(priv->k, k, sizeof(priv->k)); + + matrix_expand(&values->a_ntt, rho); + vector_expand_short(&priv->s1, &priv->s2, sigma); + + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vectork_inverse_ntt(&values->t); + vectork_add(&values->t, &values->t, &priv->s2); + + vectork_power2_round(&values->pub.t1, &priv->t0, &values->t); + // t1 is public. 
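+  // (t = A*s1 + s2 is split by Power2Round into t1 and t0; t1 is encoded into
+  // the public key below while t0 remains part of the private key, so only t1
+  // needs to be treated as public here.)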
+ CONSTTIME_DECLASSIFY(&values->pub.t1, sizeof(values->pub.t1)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_public_key, DILITHIUM_PUBLIC_KEY_BYTES); + if (!dilithium_marshal_public_key(&cbb, &values->pub)) { + goto err; + } + + BORINGSSL_keccak(priv->public_key_hash, sizeof(priv->public_key_hash), + out_encoded_public_key, DILITHIUM_PUBLIC_KEY_BYTES, + boringssl_shake256); + + ret = 1; +err: + OPENSSL_free(values); + return ret; +} + +int DILITHIUM_public_from_private( + struct DILITHIUM_public_key *out_public_key, + const struct DILITHIUM_private_key *private_key) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + matrix a_ntt; + vectorl s1_ntt; + vectork t; + vectork t0; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + const struct private_key *priv = private_key_from_external(private_key); + struct public_key *pub = public_key_from_external(out_public_key); + + OPENSSL_memcpy(pub->rho, priv->rho, sizeof(pub->rho)); + OPENSSL_memcpy(pub->public_key_hash, priv->public_key_hash, + sizeof(pub->public_key_hash)); + + matrix_expand(&values->a_ntt, priv->rho); + + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vectork_inverse_ntt(&values->t); + vectork_add(&values->t, &values->t, &priv->s2); + + vectork_power2_round(&pub->t1, &values->t0, &values->t); + + ret = 1; +err: + OPENSSL_free(values); + return ret; +} + +// FIPS 204, Algorithm 2 (`ML-DSA.Sign`). Returns 1 on success and 0 on failure. +static int dilithium_sign_with_randomizer( + uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, const uint8_t *msg, + size_t msg_len, + const uint8_t randomizer[DILITHIUM_SIGNATURE_RANDOMIZER_BYTES]) { + int ret = 0; + + const struct private_key *priv = private_key_from_external(private_key); + + uint8_t mu[MU_BYTES]; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->public_key_hash, + sizeof(priv->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, MU_BYTES); + + uint8_t rho_prime[RHO_PRIME_BYTES]; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->k, sizeof(priv->k)); + BORINGSSL_keccak_absorb(&keccak_ctx, randomizer, + DILITHIUM_SIGNATURE_RANDOMIZER_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_squeeze(&keccak_ctx, rho_prime, RHO_PRIME_BYTES); + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. 
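+  // The loop below is the rejection-sampling ("Fiat-Shamir with aborts") core
+  // of signing: each iteration expands a fresh masking vector y from
+  // rho_prime and kappa, derives a candidate (c_tilde, z, h), and starts over
+  // whenever one of the norm or hint-weight bounds fails, so that the
+  // distribution of released signatures does not depend on the secret key.
+  // |struct values_st| holds both the NTT-domain copies of the key computed
+  // once up front and the per-attempt scratch vectors.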
+ struct values_st { + struct signature sign; + vectorl s1_ntt; + vectork s2_ntt; + vectork t0_ntt; + matrix a_ntt; + vectorl y; + vectorl y_ntt; + vectork w; + vectork w1; + vectorl cs1; + vectork cs2; + vectork r0; + vectork ct0; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + OPENSSL_memcpy(&values->s2_ntt, &priv->s2, sizeof(values->s2_ntt)); + vectork_ntt(&values->s2_ntt); + + OPENSSL_memcpy(&values->t0_ntt, &priv->t0, sizeof(values->t0_ntt)); + vectork_ntt(&values->t0_ntt); + + matrix_expand(&values->a_ntt, priv->rho); + + for (size_t kappa = 0;; kappa += L) { + // TODO(bbe): y only lives long enough to compute y_ntt. + // consider using another vectorl to save memory. + vectorl_expand_mask(&values->y, rho_prime, kappa); + + OPENSSL_memcpy(&values->y_ntt, &values->y, sizeof(values->y_ntt)); + vectorl_ntt(&values->y_ntt); + + // TODO(bbe): w only lives long enough to compute y_ntt. + // consider using another vectork to save memory. + matrix_mult(&values->w, &values->a_ntt, &values->y_ntt); + vectork_inverse_ntt(&values->w); + + vectork_high_bits(&values->w1, &values->w); + uint8_t w1_encoded[128 * K]; + w1_encode(w1_encoded, &values->w1); + + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K); + BORINGSSL_keccak_squeeze(&keccak_ctx, values->sign.c_tilde, + 2 * LAMBDA_BYTES); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, 32); + scalar_ntt(&c_ntt); + + vectorl_mult_scalar(&values->cs1, &values->s1_ntt, &c_ntt); + vectorl_inverse_ntt(&values->cs1); + vectork_mult_scalar(&values->cs2, &values->s2_ntt, &c_ntt); + vectork_inverse_ntt(&values->cs2); + + vectorl_add(&values->sign.z, &values->y, &values->cs1); + + vectork_sub(&values->r0, &values->w, &values->cs2); + vectork_low_bits(&values->r0, &values->r0); + + // Leaking the fact that a signature was rejected is fine as the next + // attempt at a signature will be (indistinguishable from) independent of + // this one. Note, however, that we additionally leak which of the two + // branches rejected the signature. Section 5.5 of + // https://pq-crystals.org/dilithium/data/dilithium-specification-round3.pdf + // describes this leak as OK. Note we leak less than what is described by + // the paper; we do not reveal which coefficient violated the bound, and we + // hide which of the |z_max| or |r0_max| bound failed. See also + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/2bbab0fa_d241d35a/ + uint32_t z_max = vectorl_max(&values->sign.z); + uint32_t r0_max = vectork_max_signed(&values->r0); + if (constant_time_declassify_w( + constant_time_ge_w(z_max, kGamma1 - BETA) | + constant_time_ge_w(r0_max, kGamma2 - BETA))) { + continue; + } + + vectork_mult_scalar(&values->ct0, &values->t0_ntt, &c_ntt); + vectork_inverse_ntt(&values->ct0); + vectork_make_hint(&values->sign.h, &values->ct0, &values->cs2, &values->w); + + // See above. + uint32_t ct0_max = vectork_max(&values->ct0); + size_t h_ones = vectork_count_ones(&values->sign.h); + if (constant_time_declassify_w(constant_time_ge_w(ct0_max, kGamma2) | + constant_time_lt_w(OMEGA, h_ones))) { + continue; + } + + // Although computed with the private key, the signature is public. 
+ CONSTTIME_DECLASSIFY(values->sign.c_tilde, sizeof(values->sign.c_tilde)); + CONSTTIME_DECLASSIFY(&values->sign.z, sizeof(values->sign.z)); + CONSTTIME_DECLASSIFY(&values->sign.h, sizeof(values->sign.h)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_signature, DILITHIUM_SIGNATURE_BYTES); + if (!dilithium_marshal_signature(&cbb, &values->sign)) { + goto err; + } + + BSSL_CHECK(CBB_len(&cbb) == DILITHIUM_SIGNATURE_BYTES); + ret = 1; + break; + } + +err: + OPENSSL_free(values); + return ret; +} + +// Dilithium signature in deterministic mode. Returns 1 on success and 0 on +// failure. +int DILITHIUM_sign_deterministic( + uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, const uint8_t *msg, + size_t msg_len) { + uint8_t randomizer[DILITHIUM_SIGNATURE_RANDOMIZER_BYTES]; + OPENSSL_memset(randomizer, 0, sizeof(randomizer)); + return dilithium_sign_with_randomizer(out_encoded_signature, private_key, msg, + msg_len, randomizer); +} + +// Dilithium signature in randomized mode, filling the random bytes with +// |RAND_bytes|. Returns 1 on success and 0 on failure. +int DILITHIUM_sign(uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, + const uint8_t *msg, size_t msg_len) { + uint8_t randomizer[DILITHIUM_SIGNATURE_RANDOMIZER_BYTES]; + RAND_bytes(randomizer, sizeof(randomizer)); + return dilithium_sign_with_randomizer(out_encoded_signature, private_key, msg, + msg_len, randomizer); +} + +// FIPS 204, Algorithm 3 (`ML-DSA.Verify`). +int DILITHIUM_verify(const struct DILITHIUM_public_key *public_key, + const uint8_t encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. 
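+  // Verification recomputes w1' = UseHint(h, A*z - c*t1*2^d) from the public
+  // key and the signature, then checks that SHAKE-256(mu || w1Encode(w1'))
+  // matches c_tilde and that the bounds on z and on the number of hint bits
+  // hold.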
+ struct values_st { + struct signature sign; + matrix a_ntt; + vectorl z_ntt; + vectork az_ntt; + vectork t1_ntt; + vectork ct1_ntt; + vectork w_approx; + vectork w1; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + const struct public_key *pub = public_key_from_external(public_key); + + CBS cbs; + CBS_init(&cbs, encoded_signature, DILITHIUM_SIGNATURE_BYTES); + if (!dilithium_parse_signature(&values->sign, &cbs)) { + goto err; + } + + matrix_expand(&values->a_ntt, pub->rho); + + uint8_t mu[MU_BYTES]; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, pub->public_key_hash, + sizeof(pub->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, MU_BYTES); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, 32); + scalar_ntt(&c_ntt); + + OPENSSL_memcpy(&values->z_ntt, &values->sign.z, sizeof(values->z_ntt)); + vectorl_ntt(&values->z_ntt); + + matrix_mult(&values->az_ntt, &values->a_ntt, &values->z_ntt); + + vectork_scale_power2_round(&values->t1_ntt, &pub->t1); + vectork_ntt(&values->t1_ntt); + + vectork_mult_scalar(&values->ct1_ntt, &values->t1_ntt, &c_ntt); + + vectork_sub(&values->w_approx, &values->az_ntt, &values->ct1_ntt); + vectork_inverse_ntt(&values->w_approx); + + vectork_use_hint_vartime(&values->w1, &values->sign.h, &values->w_approx); + uint8_t w1_encoded[128 * K]; + w1_encode(w1_encoded, &values->w1); + + uint8_t c_tilde[2 * LAMBDA_BYTES]; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K); + BORINGSSL_keccak_squeeze(&keccak_ctx, c_tilde, 2 * LAMBDA_BYTES); + + uint32_t z_max = vectorl_max(&values->sign.z); + size_t h_ones = vectork_count_ones(&values->sign.h); + if (z_max < kGamma1 - BETA && h_ones <= OMEGA && + OPENSSL_memcmp(c_tilde, values->sign.c_tilde, 2 * LAMBDA_BYTES) == 0) { + ret = 1; + } + +err: + OPENSSL_free(values); + return ret; +} + +/* Serialization of keys. */ + +int DILITHIUM_marshal_public_key( + CBB *out, const struct DILITHIUM_public_key *public_key) { + return dilithium_marshal_public_key(out, + public_key_from_external(public_key)); +} + +int DILITHIUM_parse_public_key(struct DILITHIUM_public_key *public_key, + CBS *in) { + struct public_key *pub = public_key_from_external(public_key); + CBS orig_in = *in; + if (!dilithium_parse_public_key(pub, in) || CBS_len(in) != 0) { + return 0; + } + + // Compute pre-cached values. 
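+  // The pre-cached value is |public_key_hash| (tr in FIPS 204): a 64-byte
+  // SHAKE-256 digest of the encoded public key. Both signing and verification
+  // absorb it ahead of the message when deriving mu, which binds signatures
+  // to this exact public key encoding.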
+ BORINGSSL_keccak(pub->public_key_hash, sizeof(pub->public_key_hash), + CBS_data(&orig_in), CBS_len(&orig_in), boringssl_shake256); + return 1; +} + +int DILITHIUM_marshal_private_key( + CBB *out, const struct DILITHIUM_private_key *private_key) { + return dilithium_marshal_private_key(out, + private_key_from_external(private_key)); +} + +int DILITHIUM_parse_private_key(struct DILITHIUM_private_key *private_key, + CBS *in) { + struct private_key *priv = private_key_from_external(private_key); + return dilithium_parse_private_key(priv, in) && CBS_len(in) == 0; +} diff --git a/Sources/CCryptoBoringSSL/crypto/dilithium/internal.h b/Sources/CCryptoBoringSSL/crypto/dilithium/internal.h new file mode 100644 index 00000000..a8a7b3d1 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/dilithium/internal.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2023, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_DILITHIUM_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_DILITHIUM_INTERNAL_H + +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + + +// DILITHIUM_GENERATE_KEY_ENTROPY is the number of bytes of uniformly random +// entropy necessary to generate a key pair. +#define DILITHIUM_GENERATE_KEY_ENTROPY 32 + +// DILITHIUM_SIGNATURE_RANDOMIZER_BYTES is the number of bytes of uniformly +// random entropy necessary to generate a signature in randomized mode. +#define DILITHIUM_SIGNATURE_RANDOMIZER_BYTES 32 + +// DILITHIUM_generate_key_external_entropy generates a public/private key pair +// using the given seed, writes the encoded public key to +// |out_encoded_public_key| and sets |out_private_key| to the private key, +// returning 1 on success and 0 on failure. Returns 1 on success and 0 on +// failure. +OPENSSL_EXPORT int DILITHIUM_generate_key_external_entropy( + uint8_t out_encoded_public_key[DILITHIUM_PUBLIC_KEY_BYTES], + struct DILITHIUM_private_key *out_private_key, + const uint8_t entropy[DILITHIUM_GENERATE_KEY_ENTROPY]); + +// DILITHIUM_sign_deterministic generates a signature for the message |msg| of +// length |msg_len| using |private_key| following the deterministic algorithm, +// and writes the encoded signature to |out_encoded_signature|. Returns 1 on +// success and 0 on failure. 
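+//
+// As a sketch of the expected round trip (error handling elided, and assuming
+// a message |msg| of length |msg_len|):
+//
+//   uint8_t pub[DILITHIUM_PUBLIC_KEY_BYTES];
+//   uint8_t sig[DILITHIUM_SIGNATURE_BYTES];
+//   struct DILITHIUM_private_key priv;
+//   struct DILITHIUM_public_key parsed;
+//   CBS cbs;
+//
+//   DILITHIUM_generate_key(pub, &priv);
+//   DILITHIUM_sign_deterministic(sig, &priv, msg, msg_len);
+//   CBS_init(&cbs, pub, sizeof(pub));
+//   DILITHIUM_parse_public_key(&parsed, &cbs);
+//   int ok = DILITHIUM_verify(&parsed, sig, msg, msg_len);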
+OPENSSL_EXPORT int DILITHIUM_sign_deterministic( + uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, const uint8_t *msg, + size_t msg_len); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_CRYPTO_DILITHIUM_INTERNAL_H diff --git a/Sources/CCryptoBoringSSL/crypto/dsa/dsa.c b/Sources/CCryptoBoringSSL/crypto/dsa/dsa.c index 269fe6fb..40be40e7 100644 --- a/Sources/CCryptoBoringSSL/crypto/dsa/dsa.c +++ b/Sources/CCryptoBoringSSL/crypto/dsa/dsa.c @@ -208,6 +208,11 @@ int DSA_set0_pqg(DSA *dsa, BIGNUM *p, BIGNUM *q, BIGNUM *g) { int DSA_generate_parameters_ex(DSA *dsa, unsigned bits, const uint8_t *seed_in, size_t seed_len, int *out_counter, unsigned long *out_h, BN_GENCB *cb) { + if (bits > OPENSSL_DSA_MAX_MODULUS_BITS) { + OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); + return 0; + } + int ok = 0; unsigned char seed[SHA256_DIGEST_LENGTH]; unsigned char md[SHA256_DIGEST_LENGTH]; @@ -274,6 +279,8 @@ int DSA_generate_parameters_ex(DSA *dsa, unsigned bits, const uint8_t *seed_in, if (!RAND_bytes(seed, qsize)) { goto err; } + // DSA parameters are public. + CONSTTIME_DECLASSIFY(seed, qsize); } else { // If we come back through, use random seed next time. seed_in = NULL; @@ -477,11 +484,13 @@ DSA *DSAparams_dup(const DSA *dsa) { } int DSA_generate_key(DSA *dsa) { + if (!dsa_check_key(dsa)) { + return 0; + } + int ok = 0; - BN_CTX *ctx = NULL; BIGNUM *pub_key = NULL, *priv_key = NULL; - - ctx = BN_CTX_new(); + BN_CTX *ctx = BN_CTX_new(); if (ctx == NULL) { goto err; } @@ -513,6 +522,9 @@ int DSA_generate_key(DSA *dsa) { goto err; } + // The public key is computed from the private key, but is public. + bn_declassify(pub_key); + dsa->priv_key = priv_key; dsa->pub_key = pub_key; ok = 1; @@ -649,6 +661,10 @@ DSA_SIG *DSA_do_sign(const uint8_t *digest, size_t digest_len, const DSA *dsa) { goto err; } + // The signature is computed from the private key, but is public. + bn_declassify(r); + bn_declassify(s); + // Redo if r or s is zero as required by FIPS 186-3: this is // very unlikely. if (BN_is_zero(r) || BN_is_zero(s)) { @@ -681,7 +697,7 @@ DSA_SIG *DSA_do_sign(const uint8_t *digest, size_t digest_len, const DSA *dsa) { return ret; } -int DSA_do_verify(const uint8_t *digest, size_t digest_len, DSA_SIG *sig, +int DSA_do_verify(const uint8_t *digest, size_t digest_len, const DSA_SIG *sig, const DSA *dsa) { int valid; if (!DSA_do_check_signature(&valid, digest, digest_len, sig, dsa)) { @@ -691,7 +707,8 @@ int DSA_do_verify(const uint8_t *digest, size_t digest_len, DSA_SIG *sig, } int DSA_do_check_signature(int *out_valid, const uint8_t *digest, - size_t digest_len, DSA_SIG *sig, const DSA *dsa) { + size_t digest_len, const DSA_SIG *sig, + const DSA *dsa) { *out_valid = 0; if (!dsa_check_key(dsa)) { return 0; @@ -899,15 +916,19 @@ static int dsa_sign_setup(const DSA *dsa, BN_CTX *ctx, BIGNUM **out_kinv, ctx) || // Compute r = (g^k mod p) mod q !BN_mod_exp_mont_consttime(r, dsa->g, &k, dsa->p, ctx, - dsa->method_mont_p) || - // Note |BN_mod| below is not constant-time and may leak information about - // |r|. |dsa->p| may be significantly larger than |dsa->q|, so this is not - // easily performed in constant-time with Montgomery reduction. - // - // However, |r| at this point is g^k (mod p). It is almost the value of - // |r| revealed in the signature anyway (g^k (mod p) (mod q)), going from - // it to |k| would require computing a discrete log. 
- !BN_mod(r, r, dsa->q, ctx) || + dsa->method_mont_p)) { + OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); + goto err; + } + // Note |BN_mod| below is not constant-time and may leak information about + // |r|. |dsa->p| may be significantly larger than |dsa->q|, so this is not + // easily performed in constant-time with Montgomery reduction. + // + // However, |r| at this point is g^k (mod p). It is almost the value of |r| + // revealed in the signature anyway (g^k (mod p) (mod q)), going from it to + // |k| would require computing a discrete log. + bn_declassify(r); + if (!BN_mod(r, r, dsa->q, ctx) || // Compute part of 's = inv(k) (m + xr) mod q' using Fermat's Little // Theorem. !bn_mod_inverse_prime(kinv, &k, dsa->q, ctx, dsa->method_mont_q)) { diff --git a/Sources/CCryptoBoringSSL/crypto/dsa/dsa_asn1.c b/Sources/CCryptoBoringSSL/crypto/dsa/dsa_asn1.c index 9c6dfaed..010c3465 100644 --- a/Sources/CCryptoBoringSSL/crypto/dsa/dsa_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/dsa/dsa_asn1.c @@ -65,8 +65,6 @@ #include "../bytestring/internal.h" -#define OPENSSL_DSA_MAX_MODULUS_BITS 10000 - // This function is in dsa_asn1.c rather than dsa.c because it is reachable from // |EVP_PKEY| parsers. This makes it easier for the static linker to drop most // of the DSA implementation. @@ -119,8 +117,9 @@ int dsa_check_key(const DSA *dsa) { if (dsa->priv_key != NULL) { // The private key is a non-zero element of the scalar field, determined by // |q|. - if (BN_is_negative(dsa->priv_key) || BN_is_zero(dsa->priv_key) || - BN_cmp(dsa->priv_key, dsa->q) >= 0) { + if (BN_is_negative(dsa->priv_key) || + constant_time_declassify_int(BN_is_zero(dsa->priv_key)) || + constant_time_declassify_int(BN_cmp(dsa->priv_key, dsa->q) >= 0)) { OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); return 0; } diff --git a/Sources/CCryptoBoringSSL/crypto/dsa/internal.h b/Sources/CCryptoBoringSSL/crypto/dsa/internal.h index eb537d6f..44a40934 100644 --- a/Sources/CCryptoBoringSSL/crypto/dsa/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/dsa/internal.h @@ -42,6 +42,8 @@ struct dsa_st { CRYPTO_EX_DATA ex_data; }; +#define OPENSSL_DSA_MAX_MODULUS_BITS 10000 + // dsa_check_key performs cheap self-checks on |dsa|, and ensures it is within // DoS bounds. It returns one on success and zero on error. 
int dsa_check_key(const DSA *dsa); diff --git a/Sources/CCryptoBoringSSL/crypto/ecdsa_extra/ecdsa_asn1.c b/Sources/CCryptoBoringSSL/crypto/ecdsa_extra/ecdsa_asn1.c index 3650858e..11c9deb9 100644 --- a/Sources/CCryptoBoringSSL/crypto/ecdsa_extra/ecdsa_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/ecdsa_extra/ecdsa_asn1.c @@ -62,34 +62,87 @@ #include #include "../bytestring/internal.h" -#include "../fipsmodule/ec/internal.h" +#include "../fipsmodule/ecdsa/internal.h" #include "../internal.h" +static ECDSA_SIG *ecdsa_sig_from_fixed(const EC_KEY *key, const uint8_t *in, + size_t len) { + const EC_GROUP *group = EC_KEY_get0_group(key); + if (group == NULL) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); + return NULL; + } + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + if (len != 2 * scalar_len) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + return NULL; + } + ECDSA_SIG *ret = ECDSA_SIG_new(); + if (ret == NULL || + !BN_bin2bn(in, scalar_len, ret->r) || + !BN_bin2bn(in + scalar_len, scalar_len, ret->s)) { + ECDSA_SIG_free(ret); + return NULL; + } + return ret; +} + +static int ecdsa_sig_to_fixed(const EC_KEY *key, uint8_t *out, size_t *out_len, + size_t max_out, const ECDSA_SIG *sig) { + const EC_GROUP *group = EC_KEY_get0_group(key); + if (group == NULL) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + if (max_out < 2 * scalar_len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; + } + if (BN_is_negative(sig->r) || + !BN_bn2bin_padded(out, scalar_len, sig->r) || + BN_is_negative(sig->s) || + !BN_bn2bin_padded(out + scalar_len, scalar_len, sig->s)) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + return 0; + } + *out_len = 2 * scalar_len; + return 1; +} + int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, - unsigned int *sig_len, const EC_KEY *eckey) { + unsigned int *out_sig_len, const EC_KEY *eckey) { if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { - return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len, + return eckey->ecdsa_meth->sign(digest, digest_len, sig, out_sig_len, (EC_KEY*) eckey /* cast away const */); } - int ret = 0; - ECDSA_SIG *s = ECDSA_do_sign(digest, digest_len, eckey); + *out_sig_len = 0; + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + if (!ecdsa_sign_fixed(digest, digest_len, fixed, &fixed_len, sizeof(fixed), + eckey)) { + return 0; + } + + // TODO(davidben): We can actually do better and go straight from the DER + // format to the fixed-width format without a malloc. + ECDSA_SIG *s = ecdsa_sig_from_fixed(eckey, fixed, fixed_len); if (s == NULL) { - *sig_len = 0; - goto err; + return 0; } + int ret = 0; CBB cbb; CBB_init_fixed(&cbb, sig, ECDSA_size(eckey)); size_t len; if (!ECDSA_SIG_marshal(&cbb, s) || !CBB_finish(&cbb, NULL, &len)) { OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_ENCODE_ERROR); - *sig_len = 0; goto err; } - *sig_len = (unsigned)len; + *out_sig_len = (unsigned)len; ret = 1; err: @@ -99,12 +152,13 @@ int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len, const uint8_t *sig, size_t sig_len, const EC_KEY *eckey) { - ECDSA_SIG *s; + // Decode the ECDSA signature. + // + // TODO(davidben): We can actually do better and go straight from the DER + // format to the fixed-width format without a malloc. int ret = 0; uint8_t *der = NULL; - - // Decode the ECDSA signature. 
- s = ECDSA_SIG_from_bytes(sig, sig_len); + ECDSA_SIG *s = ECDSA_SIG_from_bytes(sig, sig_len); if (s == NULL) { goto err; } @@ -118,7 +172,10 @@ int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len, goto err; } - ret = ECDSA_do_verify(digest, digest_len, s, eckey); + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + ret = ecdsa_sig_to_fixed(eckey, fixed, &fixed_len, sizeof(fixed), s) && + ecdsa_verify_fixed(digest, digest_len, fixed, fixed_len, eckey); err: OPENSSL_free(der); @@ -147,6 +204,95 @@ size_t ECDSA_size(const EC_KEY *key) { return ECDSA_SIG_max_len(group_order_size); } +ECDSA_SIG *ECDSA_SIG_new(void) { + ECDSA_SIG *sig = OPENSSL_malloc(sizeof(ECDSA_SIG)); + if (sig == NULL) { + return NULL; + } + sig->r = BN_new(); + sig->s = BN_new(); + if (sig->r == NULL || sig->s == NULL) { + ECDSA_SIG_free(sig); + return NULL; + } + return sig; +} + +void ECDSA_SIG_free(ECDSA_SIG *sig) { + if (sig == NULL) { + return; + } + + BN_free(sig->r); + BN_free(sig->s); + OPENSSL_free(sig); +} + +const BIGNUM *ECDSA_SIG_get0_r(const ECDSA_SIG *sig) { + return sig->r; +} + +const BIGNUM *ECDSA_SIG_get0_s(const ECDSA_SIG *sig) { + return sig->s; +} + +void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **out_r, + const BIGNUM **out_s) { + if (out_r != NULL) { + *out_r = sig->r; + } + if (out_s != NULL) { + *out_s = sig->s; + } +} + +int ECDSA_SIG_set0(ECDSA_SIG *sig, BIGNUM *r, BIGNUM *s) { + if (r == NULL || s == NULL) { + return 0; + } + BN_free(sig->r); + BN_free(sig->s); + sig->r = r; + sig->s = s; + return 1; +} + +int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, + const ECDSA_SIG *sig, const EC_KEY *eckey) { + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + return ecdsa_sig_to_fixed(eckey, fixed, &fixed_len, sizeof(fixed), sig) && + ecdsa_verify_fixed(digest, digest_len, fixed, fixed_len, eckey); +} + +// This function is only exported for testing and is not called in production +// code. +ECDSA_SIG *ECDSA_sign_with_nonce_and_leak_private_key_for_testing( + const uint8_t *digest, size_t digest_len, const EC_KEY *eckey, + const uint8_t *nonce, size_t nonce_len) { + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed_with_nonce_for_known_answer_test( + digest, digest_len, sig, &sig_len, sizeof(sig), eckey, nonce, + nonce_len)) { + return NULL; + } + + return ecdsa_sig_from_fixed(eckey, sig, sig_len); +} + +ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, + const EC_KEY *eckey) { + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed(digest, digest_len, sig, &sig_len, sizeof(sig), + eckey)) { + return NULL; + } + + return ecdsa_sig_from_fixed(eckey, sig, sig_len); +} + ECDSA_SIG *ECDSA_SIG_parse(CBS *cbs) { ECDSA_SIG *ret = ECDSA_SIG_new(); if (ret == NULL) { diff --git a/Sources/CCryptoBoringSSL/crypto/err/err.c b/Sources/CCryptoBoringSSL/crypto/err/err.c index 5504c6a2..36a07eac 100644 --- a/Sources/CCryptoBoringSSL/crypto/err/err.c +++ b/Sources/CCryptoBoringSSL/crypto/err/err.c @@ -164,6 +164,17 @@ extern const uint32_t kOpenSSLReasonValues[]; extern const size_t kOpenSSLReasonValuesLen; extern const char kOpenSSLReasonStringData[]; +static char *strdup_libc_malloc(const char *str) { + // |strdup| is not in C until C23, so MSVC triggers deprecation warnings, and + // glibc and musl gate it on a feature macro. Reimplementing it is easier. 
+ size_t len = strlen(str); + char *ret = malloc(len + 1); + if (ret != NULL) { + memcpy(ret, str, len + 1); + } + return ret; +} + // err_clear clears the given queued error. static void err_clear(struct err_error_st *error) { free(error->data); @@ -174,13 +185,9 @@ static void err_copy(struct err_error_st *dst, const struct err_error_st *src) { err_clear(dst); dst->file = src->file; if (src->data != NULL) { - // Disable deprecated functions on msvc so it doesn't complain about strdup. - OPENSSL_MSVC_PRAGMA(warning(push)) - OPENSSL_MSVC_PRAGMA(warning(disable : 4996)) // We can't use OPENSSL_strdup because we don't want to call OPENSSL_malloc, // which can affect the error stack. - dst->data = strdup(src->data); - OPENSSL_MSVC_PRAGMA(warning(pop)) + dst->data = strdup_libc_malloc(src->data); } dst->packed = src->packed; dst->line = src->line; @@ -767,13 +774,9 @@ void ERR_set_error_data(char *data, int flags) { assert(0); return; } - // Disable deprecated functions on msvc so it doesn't complain about strdup. - OPENSSL_MSVC_PRAGMA(warning(push)) - OPENSSL_MSVC_PRAGMA(warning(disable : 4996)) // We can not use OPENSSL_strdup because we don't want to call OPENSSL_malloc, // which can affect the error stack. - char *copy = strdup(data); - OPENSSL_MSVC_PRAGMA(warning(pop)) + char *copy = strdup_libc_malloc(data); if (copy != NULL) { err_set_error_data(copy); } diff --git a/Sources/CCryptoBoringSSL/crypto/evp/evp.c b/Sources/CCryptoBoringSSL/crypto/evp/evp.c index aa31111a..db9fae49 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/evp.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/evp.c @@ -59,12 +59,9 @@ #include #include -#include -#include #include #include #include -#include #include #include "internal.h" @@ -149,9 +146,7 @@ int EVP_PKEY_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { int EVP_PKEY_copy_parameters(EVP_PKEY *to, const EVP_PKEY *from) { if (to->type == EVP_PKEY_NONE) { - if (!EVP_PKEY_set_type(to, from->type)) { - return 0; - } + evp_pkey_set_method(to, from->ameth); } else if (to->type != from->type) { OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES); return 0; @@ -225,117 +220,21 @@ static const EVP_PKEY_ASN1_METHOD *evp_pkey_asn1_find(int nid) { } } -static void evp_pkey_set_method(EVP_PKEY *pkey, - const EVP_PKEY_ASN1_METHOD *method) { +void evp_pkey_set_method(EVP_PKEY *pkey, const EVP_PKEY_ASN1_METHOD *method) { free_it(pkey); pkey->ameth = method; pkey->type = pkey->ameth->pkey_id; } int EVP_PKEY_type(int nid) { - const EVP_PKEY_ASN1_METHOD *meth = evp_pkey_asn1_find(nid); - if (meth == NULL) { - return NID_undef; - } - return meth->pkey_id; -} - -int EVP_PKEY_set1_RSA(EVP_PKEY *pkey, RSA *key) { - if (EVP_PKEY_assign_RSA(pkey, key)) { - RSA_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_RSA(EVP_PKEY *pkey, RSA *key) { - evp_pkey_set_method(pkey, &rsa_asn1_meth); - pkey->pkey = key; - return key != NULL; -} - -RSA *EVP_PKEY_get0_RSA(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_RSA) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_RSA_KEY); - return NULL; - } - return pkey->pkey; -} - -RSA *EVP_PKEY_get1_RSA(const EVP_PKEY *pkey) { - RSA *rsa = EVP_PKEY_get0_RSA(pkey); - if (rsa != NULL) { - RSA_up_ref(rsa); - } - return rsa; + // In OpenSSL, this was used to map between type aliases. BoringSSL supports + // no type aliases, so this function is just the identity. 
+ return nid; } -int EVP_PKEY_set1_DSA(EVP_PKEY *pkey, DSA *key) { - if (EVP_PKEY_assign_DSA(pkey, key)) { - DSA_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_DSA(EVP_PKEY *pkey, DSA *key) { - evp_pkey_set_method(pkey, &dsa_asn1_meth); - pkey->pkey = key; - return key != NULL; -} - -DSA *EVP_PKEY_get0_DSA(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_DSA) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DSA_KEY); - return NULL; - } - return pkey->pkey; -} - -DSA *EVP_PKEY_get1_DSA(const EVP_PKEY *pkey) { - DSA *dsa = EVP_PKEY_get0_DSA(pkey); - if (dsa != NULL) { - DSA_up_ref(dsa); - } - return dsa; -} - -int EVP_PKEY_set1_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { - if (EVP_PKEY_assign_EC_KEY(pkey, key)) { - EC_KEY_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { - evp_pkey_set_method(pkey, &ec_asn1_meth); - pkey->pkey = key; - return key != NULL; -} - -EC_KEY *EVP_PKEY_get0_EC_KEY(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_EC) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_EC_KEY_KEY); - return NULL; - } - return pkey->pkey; -} - -EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey) { - EC_KEY *ec_key = EVP_PKEY_get0_EC_KEY(pkey); - if (ec_key != NULL) { - EC_KEY_up_ref(ec_key); - } - return ec_key; -} - -DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey) { return NULL; } -DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey) { return NULL; } - int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key) { - // This function can only be used to assign RSA, DSA, and EC keys. Other key - // types have internal representations which are not exposed through the + // This function can only be used to assign RSA, DSA, EC, and DH keys. Other + // key types have internal representations which are not exposed through the // public API. switch (type) { case EVP_PKEY_RSA: @@ -344,6 +243,8 @@ int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key) { return EVP_PKEY_assign_DSA(pkey, key); case EVP_PKEY_EC: return EVP_PKEY_assign_EC_KEY(pkey, key); + case EVP_PKEY_DH: + return EVP_PKEY_assign_DH(pkey, key); } OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); @@ -375,16 +276,26 @@ int EVP_PKEY_set_type(EVP_PKEY *pkey, int type) { EVP_PKEY *EVP_PKEY_new_raw_private_key(int type, ENGINE *unused, const uint8_t *in, size_t len) { - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { - goto err; + // To avoid pulling in all key types, look for specifically the key types that + // support |set_priv_raw|. + const EVP_PKEY_ASN1_METHOD *method; + switch (type) { + case EVP_PKEY_X25519: + method = &x25519_asn1_meth; + break; + case EVP_PKEY_ED25519: + method = &ed25519_asn1_meth; + break; + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return 0; } - if (ret->ameth->set_priv_raw == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + EVP_PKEY *ret = EVP_PKEY_new(); + if (ret == NULL) { goto err; } + evp_pkey_set_method(ret, method); if (!ret->ameth->set_priv_raw(ret, in, len)) { goto err; @@ -399,16 +310,26 @@ EVP_PKEY *EVP_PKEY_new_raw_private_key(int type, ENGINE *unused, EVP_PKEY *EVP_PKEY_new_raw_public_key(int type, ENGINE *unused, const uint8_t *in, size_t len) { - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { - goto err; + // To avoid pulling in all key types, look for specifically the key types that + // support |set_pub_raw|. 
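A usage sketch, not part of the patch, for the narrowed |EVP_PKEY_new_raw_private_key| above: with only X25519 and Ed25519 reachable through the switch, a typical Ed25519 signer looks like the following. The function name is hypothetical; the 32-byte seed and 64-byte signature sizes are the usual Ed25519 sizes and are stated here as assumptions of the sketch.

    #include <openssl/evp.h>

    // Hypothetical caller: sign |msg| with a raw 32-byte Ed25519 seed.
    static int ed25519_sign_msg(const uint8_t seed[32], const uint8_t *msg,
                                size_t msg_len, uint8_t out_sig[64]) {
      int ok = 0;
      size_t sig_len = 64;
      EVP_MD_CTX ctx;
      EVP_MD_CTX_init(&ctx);
      EVP_PKEY *pkey =
          EVP_PKEY_new_raw_private_key(EVP_PKEY_ED25519, NULL, seed, 32);
      // Ed25519 signing is one-shot, so the digest argument is NULL.
      if (pkey != NULL &&
          EVP_DigestSignInit(&ctx, NULL, NULL, NULL, pkey) &&
          EVP_DigestSign(&ctx, out_sig, &sig_len, msg, msg_len)) {
        ok = 1;
      }
      EVP_MD_CTX_cleanup(&ctx);
      EVP_PKEY_free(pkey);
      return ok;
    }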
+ const EVP_PKEY_ASN1_METHOD *method; + switch (type) { + case EVP_PKEY_X25519: + method = &x25519_asn1_meth; + break; + case EVP_PKEY_ED25519: + method = &ed25519_asn1_meth; + break; + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return 0; } - if (ret->ameth->set_pub_raw == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + EVP_PKEY *ret = EVP_PKEY_new(); + if (ret == NULL) { goto err; } + evp_pkey_set_method(ret, method); if (!ret->ameth->set_pub_raw(ret, in, len)) { goto err; diff --git a/Sources/CCryptoBoringSSL/crypto/evp/evp_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/evp_asn1.c index e2ad27f3..49413be4 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/evp_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/evp_asn1.c @@ -69,6 +69,7 @@ #include "../internal.h" +// We intentionally omit |dh_asn1_meth| from this list. It is not serializable. static const EVP_PKEY_ASN1_METHOD *const kASN1Methods[] = { &rsa_asn1_meth, &ec_asn1_meth, @@ -77,28 +78,26 @@ static const EVP_PKEY_ASN1_METHOD *const kASN1Methods[] = { &x25519_asn1_meth, }; -static int parse_key_type(CBS *cbs, int *out_type) { +static const EVP_PKEY_ASN1_METHOD *parse_key_type(CBS *cbs) { CBS oid; if (!CBS_get_asn1(cbs, &oid, CBS_ASN1_OBJECT)) { - return 0; + return NULL; } for (unsigned i = 0; i < OPENSSL_ARRAY_SIZE(kASN1Methods); i++) { const EVP_PKEY_ASN1_METHOD *method = kASN1Methods[i]; if (CBS_len(&oid) == method->oid_len && OPENSSL_memcmp(CBS_data(&oid), method->oid, method->oid_len) == 0) { - *out_type = method->pkey_id; - return 1; + return method; } } - return 0; + return NULL; } EVP_PKEY *EVP_parse_public_key(CBS *cbs) { // Parse the SubjectPublicKeyInfo. CBS spki, algorithm, key; - int type; uint8_t padding; if (!CBS_get_asn1(cbs, &spki, CBS_ASN1_SEQUENCE) || !CBS_get_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || @@ -107,7 +106,8 @@ EVP_PKEY *EVP_parse_public_key(CBS *cbs) { OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); return NULL; } - if (!parse_key_type(&algorithm, &type)) { + const EVP_PKEY_ASN1_METHOD *method = parse_key_type(&algorithm); + if (method == NULL) { OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); return NULL; } @@ -121,10 +121,10 @@ EVP_PKEY *EVP_parse_public_key(CBS *cbs) { // Set up an |EVP_PKEY| of the appropriate type. EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { + if (ret == NULL) { goto err; } + evp_pkey_set_method(ret, method); // Call into the type-specific SPKI decoding function. if (ret->ameth->pub_decode == NULL) { @@ -155,7 +155,6 @@ EVP_PKEY *EVP_parse_private_key(CBS *cbs) { // Parse the PrivateKeyInfo. CBS pkcs8, algorithm, key; uint64_t version; - int type; if (!CBS_get_asn1(cbs, &pkcs8, CBS_ASN1_SEQUENCE) || !CBS_get_asn1_uint64(&pkcs8, &version) || version != 0 || @@ -164,7 +163,8 @@ EVP_PKEY *EVP_parse_private_key(CBS *cbs) { OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); return NULL; } - if (!parse_key_type(&algorithm, &type)) { + const EVP_PKEY_ASN1_METHOD *method = parse_key_type(&algorithm); + if (method == NULL) { OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); return NULL; } @@ -173,10 +173,10 @@ EVP_PKEY *EVP_parse_private_key(CBS *cbs) { // Set up an |EVP_PKEY| of the appropriate type. EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { + if (ret == NULL) { goto err; } + evp_pkey_set_method(ret, method); // Call into the type-specific PrivateKeyInfo decoding function. 
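The parser refactor above changes how the ASN.1 method table is located, not the caller-visible contract of |EVP_parse_public_key| / |EVP_parse_private_key|. For reference, a minimal caller sketch, not part of the patch; the helper name and the trailing-data check are illustrative.

    #include <openssl/bytestring.h>
    #include <openssl/evp.h>

    // Hypothetical helper: parse a DER SubjectPublicKeyInfo and reject
    // any trailing bytes after the structure.
    static EVP_PKEY *parse_spki(const uint8_t *der, size_t der_len) {
      CBS cbs;
      CBS_init(&cbs, der, der_len);
      EVP_PKEY *pkey = EVP_parse_public_key(&cbs);
      if (pkey != NULL && CBS_len(&cbs) != 0) {
        EVP_PKEY_free(pkey);
        return NULL;
      }
      return pkey;
    }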
if (ret->ameth->priv_decode == NULL) { diff --git a/Sources/CCryptoBoringSSL/crypto/evp/internal.h b/Sources/CCryptoBoringSSL/crypto/evp/internal.h index d8e0f119..3a9785b4 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/evp/internal.h @@ -213,6 +213,7 @@ OPENSSL_EXPORT int EVP_PKEY_CTX_ctrl(EVP_PKEY_CTX *ctx, int keytype, int optype, #define EVP_PKEY_CTRL_HKDF_KEY (EVP_PKEY_ALG_CTRL + 16) #define EVP_PKEY_CTRL_HKDF_SALT (EVP_PKEY_ALG_CTRL + 17) #define EVP_PKEY_CTRL_HKDF_INFO (EVP_PKEY_ALG_CTRL + 18) +#define EVP_PKEY_CTRL_DH_PAD (EVP_PKEY_ALG_CTRL + 19) struct evp_pkey_ctx_st { // Method associated with this operation @@ -288,12 +289,18 @@ extern const EVP_PKEY_ASN1_METHOD ec_asn1_meth; extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meth; extern const EVP_PKEY_ASN1_METHOD ed25519_asn1_meth; extern const EVP_PKEY_ASN1_METHOD x25519_asn1_meth; +extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; extern const EVP_PKEY_METHOD rsa_pkey_meth; extern const EVP_PKEY_METHOD ec_pkey_meth; extern const EVP_PKEY_METHOD ed25519_pkey_meth; extern const EVP_PKEY_METHOD x25519_pkey_meth; extern const EVP_PKEY_METHOD hkdf_pkey_meth; +extern const EVP_PKEY_METHOD dh_pkey_meth; + +// evp_pkey_set_method behaves like |EVP_PKEY_set_type|, but takes a pointer to +// a method table. This avoids depending on every |EVP_PKEY_ASN1_METHOD|. +void evp_pkey_set_method(EVP_PKEY *pkey, const EVP_PKEY_ASN1_METHOD *method); #if defined(__cplusplus) diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_dh.c b/Sources/CCryptoBoringSSL/crypto/evp/p_dh.c new file mode 100644 index 00000000..5e901020 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_dh.c @@ -0,0 +1,137 @@ +/* + * Copyright 2006-2019 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include + +#include + +#include +#include +#include + +#include "internal.h" + + +typedef struct dh_pkey_ctx_st { + int pad; +} DH_PKEY_CTX; + +static int pkey_dh_init(EVP_PKEY_CTX *ctx) { + DH_PKEY_CTX *dctx = OPENSSL_zalloc(sizeof(DH_PKEY_CTX)); + if (dctx == NULL) { + return 0; + } + + ctx->data = dctx; + return 1; +} + +static int pkey_dh_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { + if (!pkey_dh_init(dst)) { + return 0; + } + + const DH_PKEY_CTX *sctx = src->data; + DH_PKEY_CTX *dctx = dst->data; + dctx->pad = sctx->pad; + return 1; +} + +static void pkey_dh_cleanup(EVP_PKEY_CTX *ctx) { + OPENSSL_free(ctx->data); + ctx->data = NULL; +} + +static int pkey_dh_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { + DH *dh = DH_new(); + if (dh == NULL || !EVP_PKEY_assign_DH(pkey, dh)) { + DH_free(dh); + return 0; + } + + if (ctx->pkey != NULL && !EVP_PKEY_copy_parameters(pkey, ctx->pkey)) { + return 0; + } + + return DH_generate_key(dh); +} + +static int pkey_dh_derive(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *out_len) { + DH_PKEY_CTX *dctx = ctx->data; + if (ctx->pkey == NULL || ctx->peerkey == NULL) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + DH *our_key = ctx->pkey->pkey; + DH *peer_key = ctx->peerkey->pkey; + if (our_key == NULL || peer_key == NULL) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + const BIGNUM *pub_key = DH_get0_pub_key(peer_key); + if (pub_key == NULL) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + if (out == NULL) { + *out_len = DH_size(our_key); + return 1; + } + + if (*out_len < (size_t)DH_size(our_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + int ret = dctx->pad ? DH_compute_key_padded(out, pub_key, our_key) + : DH_compute_key(out, pub_key, our_key); + if (ret < 0) { + return 0; + } + + assert(ret <= DH_size(our_key)); + *out_len = (size_t)ret; + return 1; +} + +static int pkey_dh_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) { + DH_PKEY_CTX *dctx = ctx->data; + switch (type) { + case EVP_PKEY_CTRL_PEER_KEY: + // |EVP_PKEY_derive_set_peer| requires the key implement this command, + // even if it is a no-op. + return 1; + + case EVP_PKEY_CTRL_DH_PAD: + dctx->pad = p1; + return 1; + + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } +} + +const EVP_PKEY_METHOD dh_pkey_meth = { + .pkey_id = EVP_PKEY_DH, + .init = pkey_dh_init, + .copy = pkey_dh_copy, + .cleanup = pkey_dh_cleanup, + .keygen = pkey_dh_keygen, + .derive = pkey_dh_derive, + .ctrl = pkey_dh_ctrl, +}; + +int EVP_PKEY_CTX_set_dh_pad(EVP_PKEY_CTX *ctx, int pad) { + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DH, EVP_PKEY_OP_DERIVE, + EVP_PKEY_CTRL_DH_PAD, pad, NULL); +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_dh_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/p_dh_asn1.c new file mode 100644 index 00000000..7132552e --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_dh_asn1.c @@ -0,0 +1,120 @@ +/* + * Copyright 2006-2021 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
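A hedged end-to-end sketch, not part of the patch, of the new DH |EVP_PKEY_METHOD| added in p_dh.c above: derive a shared secret between two DH |EVP_PKEY|s, querying the output size with a NULL buffer first, which |pkey_dh_derive| explicitly supports. The helper name is an assumption. Note the ownership split in the accessors added just below: |EVP_PKEY_assign_DH| takes ownership of the |DH|, while |EVP_PKEY_set1_DH| adds a reference.

    #include <openssl/dh.h>
    #include <openssl/evp.h>

    // Hypothetical helper: returns the shared secret length, or 0 on error.
    static size_t dh_derive_shared(EVP_PKEY *our_key, EVP_PKEY *peer_key,
                                   uint8_t *out, size_t max_out) {
      size_t len = 0;
      EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(our_key, NULL);
      if (ctx == NULL ||
          !EVP_PKEY_derive_init(ctx) ||
          // Pad the secret to DH_size(), via EVP_PKEY_CTRL_DH_PAD above.
          !EVP_PKEY_CTX_set_dh_pad(ctx, 1) ||
          !EVP_PKEY_derive_set_peer(ctx, peer_key) ||
          !EVP_PKEY_derive(ctx, NULL, &len) ||  // query the maximum size
          len > max_out ||
          !EVP_PKEY_derive(ctx, out, &len)) {
        len = 0;
      }
      EVP_PKEY_CTX_free(ctx);
      return len;
    }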
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include + +#include +#include +#include + +#include "internal.h" +#include "../internal.h" + + +static void dh_free(EVP_PKEY *pkey) { + DH_free(pkey->pkey); + pkey->pkey = NULL; +} + +static int dh_size(const EVP_PKEY *pkey) { return DH_size(pkey->pkey); } + +static int dh_bits(const EVP_PKEY *pkey) { return DH_bits(pkey->pkey); } + +static int dh_param_missing(const EVP_PKEY *pkey) { + const DH *dh = pkey->pkey; + return dh == NULL || DH_get0_p(dh) == NULL || DH_get0_g(dh) == NULL; +} + +static int dh_param_copy(EVP_PKEY *to, const EVP_PKEY *from) { + if (dh_param_missing(from)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); + return 0; + } + + const DH *dh = from->pkey; + const BIGNUM *q_old = DH_get0_q(dh); + BIGNUM *p = BN_dup(DH_get0_p(dh)); + BIGNUM *q = q_old == NULL ? NULL : BN_dup(q_old); + BIGNUM *g = BN_dup(DH_get0_g(dh)); + if (p == NULL || (q_old != NULL && q == NULL) || g == NULL || + !DH_set0_pqg(to->pkey, p, q, g)) { + BN_free(p); + BN_free(q); + BN_free(g); + return 0; + } + + // |DH_set0_pqg| took ownership of |p|, |q|, and |g|. + return 1; +} + +static int dh_param_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { + if (dh_param_missing(a) || dh_param_missing(b)) { + return -2; + } + + // Matching OpenSSL, only compare p and g for PKCS#3-style Diffie-Hellman. + // OpenSSL only checks q in X9.42-style Diffie-Hellman ("DHX"). + const DH *a_dh = a->pkey; + const DH *b_dh = b->pkey; + return BN_cmp(DH_get0_p(a_dh), DH_get0_p(b_dh)) == 0 && + BN_cmp(DH_get0_g(a_dh), DH_get0_g(b_dh)) == 0; +} + +static int dh_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { + if (dh_param_cmp(a, b) <= 0) { + return 0; + } + + const DH *a_dh = a->pkey; + const DH *b_dh = b->pkey; + return BN_cmp(DH_get0_pub_key(a_dh), DH_get0_pub_key(b_dh)) == 0; +} + +const EVP_PKEY_ASN1_METHOD dh_asn1_meth = { + .pkey_id = EVP_PKEY_DH, + .pkey_method = &dh_pkey_meth, + .pub_cmp = dh_pub_cmp, + .pkey_size = dh_size, + .pkey_bits = dh_bits, + .param_missing = dh_param_missing, + .param_copy = dh_param_copy, + .param_cmp = dh_param_cmp, + .pkey_free = dh_free, +}; + +int EVP_PKEY_set1_DH(EVP_PKEY *pkey, DH *key) { + if (EVP_PKEY_assign_DH(pkey, key)) { + DH_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_DH(EVP_PKEY *pkey, DH *key) { + evp_pkey_set_method(pkey, &dh_asn1_meth); + pkey->pkey = key; + return key != NULL; +} + +DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey) { + if (pkey->type != EVP_PKEY_DH) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DH_KEY); + return NULL; + } + return pkey->pkey; +} + +DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey) { + DH *dh = EVP_PKEY_get0_DH(pkey); + if (dh != NULL) { + DH_up_ref(dh); + } + return dh; +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_dsa_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/p_dsa_asn1.c index 02dc36ad..e7d13bba 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_dsa_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_dsa_asn1.c @@ -306,3 +306,33 @@ int EVP_PKEY_CTX_set_dsa_paramgen_q_bits(EVP_PKEY_CTX *ctx, int qbits) { OPENSSL_PUT_ERROR(EVP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); return 0; } + +int EVP_PKEY_set1_DSA(EVP_PKEY *pkey, DSA *key) { + if (EVP_PKEY_assign_DSA(pkey, key)) { + DSA_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_DSA(EVP_PKEY *pkey, DSA *key) { + evp_pkey_set_method(pkey, &dsa_asn1_meth); + pkey->pkey = key; + return key != NULL; +} + +DSA *EVP_PKEY_get0_DSA(const 
EVP_PKEY *pkey) { + if (pkey->type != EVP_PKEY_DSA) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DSA_KEY); + return NULL; + } + return pkey->pkey; +} + +DSA *EVP_PKEY_get1_DSA(const EVP_PKEY *pkey) { + DSA *dsa = EVP_PKEY_get0_DSA(pkey); + if (dsa != NULL) { + DSA_up_ref(dsa); + } + return dsa; +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_ec.c b/Sources/CCryptoBoringSSL/crypto/evp/p_ec.c index cd523a9e..3cebd638 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_ec.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_ec.c @@ -90,15 +90,14 @@ static int pkey_ec_init(EVP_PKEY_CTX *ctx) { } static int pkey_ec_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { - EC_PKEY_CTX *dctx, *sctx; if (!pkey_ec_init(dst)) { return 0; } - sctx = src->data; - dctx = dst->data; + const EC_PKEY_CTX *sctx = src->data; + EC_PKEY_CTX *dctx = dst->data; dctx->md = sctx->md; - + dctx->gen_group = sctx->gen_group; return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_ec_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/p_ec_asn1.c index 2724e590..2682cf8d 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_ec_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_ec_asn1.c @@ -300,3 +300,33 @@ const EVP_PKEY_ASN1_METHOD ec_asn1_meth = { int_ec_free, }; + +int EVP_PKEY_set1_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { + if (EVP_PKEY_assign_EC_KEY(pkey, key)) { + EC_KEY_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { + evp_pkey_set_method(pkey, &ec_asn1_meth); + pkey->pkey = key; + return key != NULL; +} + +EC_KEY *EVP_PKEY_get0_EC_KEY(const EVP_PKEY *pkey) { + if (pkey->type != EVP_PKEY_EC) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_EC_KEY_KEY); + return NULL; + } + return pkey->pkey; +} + +EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey) { + EC_KEY *ec_key = EVP_PKEY_get0_EC_KEY(pkey); + if (ec_key != NULL) { + EC_KEY_up_ref(ec_key); + } + return ec_key; +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_ed25519.c b/Sources/CCryptoBoringSSL/crypto/evp/p_ed25519.c index ffae3ca6..6aabdd1b 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_ed25519.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_ed25519.c @@ -30,10 +30,7 @@ static int pkey_ed25519_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { return 0; } - if (!EVP_PKEY_set_type(pkey, EVP_PKEY_ED25519)) { - OPENSSL_free(key); - return 0; - } + evp_pkey_set_method(pkey, &ed25519_asn1_meth); uint8_t pubkey_unused[32]; ED25519_keypair(pubkey_unused, key->key); diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_rsa_asn1.c b/Sources/CCryptoBoringSSL/crypto/evp/p_rsa_asn1.c index 72843ee5..fcba3e9f 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_rsa_asn1.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_rsa_asn1.c @@ -209,3 +209,33 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meth = { int_rsa_free, }; + +int EVP_PKEY_set1_RSA(EVP_PKEY *pkey, RSA *key) { + if (EVP_PKEY_assign_RSA(pkey, key)) { + RSA_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_RSA(EVP_PKEY *pkey, RSA *key) { + evp_pkey_set_method(pkey, &rsa_asn1_meth); + pkey->pkey = key; + return key != NULL; +} + +RSA *EVP_PKEY_get0_RSA(const EVP_PKEY *pkey) { + if (pkey->type != EVP_PKEY_RSA) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_RSA_KEY); + return NULL; + } + return pkey->pkey; +} + +RSA *EVP_PKEY_get1_RSA(const EVP_PKEY *pkey) { + RSA *rsa = EVP_PKEY_get0_RSA(pkey); + if (rsa != NULL) { + RSA_up_ref(rsa); + } + return rsa; +} diff --git a/Sources/CCryptoBoringSSL/crypto/evp/p_x25519.c b/Sources/CCryptoBoringSSL/crypto/evp/p_x25519.c index 
4e1147b3..7762d9e6 100644 --- a/Sources/CCryptoBoringSSL/crypto/evp/p_x25519.c +++ b/Sources/CCryptoBoringSSL/crypto/evp/p_x25519.c @@ -30,10 +30,7 @@ static int pkey_x25519_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { return 0; } - if (!EVP_PKEY_set_type(pkey, EVP_PKEY_X25519)) { - OPENSSL_free(key); - return 0; - } + evp_pkey_set_method(pkey, &x25519_asn1_meth); X25519_keypair(key->pub, key->priv); key->has_private = 1; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c.inc similarity index 85% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c.inc index ade87963..0b978d4d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes.c.inc @@ -104,3 +104,24 @@ int AES_set_decrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey) { return aes_nohw_set_decrypt_key(key, bits, aeskey); } } + +#if defined(HWAES) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) +// On x86 and x86_64, |aes_hw_set_decrypt_key|, we implement +// |aes_hw_encrypt_key_to_decrypt_key| in assembly and rely on C code to combine +// the operations. +int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) { + int ret = aes_hw_set_encrypt_key(user_key, bits, key); + if (ret == 0) { + aes_hw_encrypt_key_to_decrypt_key(key); + } + return ret; +} + +int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) { + if (aes_hw_set_encrypt_key_alt_preferred()) { + return aes_hw_set_encrypt_key_alt(user_key, bits, key); + } else { + return aes_hw_set_encrypt_key_base(user_key, bits, key); + } +} +#endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes_nohw.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes_nohw.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes_nohw.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/aes_nohw.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/internal.h index 98b2a14d..b4990957 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/internal.h @@ -17,6 +17,8 @@ #include +#include + #include "../../internal.h" #if defined(__cplusplus) @@ -66,17 +68,41 @@ OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); } #if defined(HWAES) -int aes_hw_set_encrypt_key(const uint8_t *user_key, const int bits, - AES_KEY *key); -int aes_hw_set_decrypt_key(const uint8_t *user_key, const int bits, - AES_KEY *key); +int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key); +int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key); void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); void aes_hw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, uint8_t *ivec, const int enc); + const AES_KEY *key, uint8_t *ivec, int enc); void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, const AES_KEY *key, const uint8_t ivec[16]); +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +// On x86 and x86_64, |aes_hw_set_decrypt_key| is implemented in terms of +// |aes_hw_set_encrypt_key| and a conversion function. 
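The internal |aes_hw_*| hooks discussed above sit behind the public AES_* key-schedule and block-cipher entry points touched in aes.c.inc. A caller-side sketch, not part of the patch; the helper name and the single-block round-trip framing are illustrative only.

    #include <string.h>
    #include <openssl/aes.h>

    // Hypothetical helper: one-block AES-256 encrypt/decrypt round trip.
    // AES_set_encrypt_key/AES_set_decrypt_key return 0 on success.
    static int aes_roundtrip_one_block(const uint8_t key[32],
                                       const uint8_t in[16], uint8_t out[16]) {
      AES_KEY enc_key, dec_key;
      uint8_t tmp[16];
      if (AES_set_encrypt_key(key, 256, &enc_key) != 0 ||
          AES_set_decrypt_key(key, 256, &dec_key) != 0) {
        return 0;
      }
      AES_encrypt(in, tmp, &enc_key);  // picks a hwaes/vpaes/nohw backend
      AES_decrypt(tmp, out, &dec_key);
      return memcmp(in, out, 16) == 0;
    }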
+void aes_hw_encrypt_key_to_decrypt_key(AES_KEY *key); + +// There are two variants of this function, one which uses aeskeygenassist +// ("base") and one which uses aesenclast + pshufb ("alt"). aesenclast is +// overall faster but is slower on some older processors. It doesn't use AVX, +// but AVX is used as a proxy to detecting this. See +// https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ +// +// TODO(davidben): It is unclear if the aeskeygenassist version is still +// worthwhile. However, the aesenclast version requires SSSE3. SSSE3 long +// predates AES-NI, but it's not clear if AES-NI implies SSSE3. In OpenSSL, the +// CCM AES-NI assembly seems to assume it does. +OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_capable(void) { + return hwaes_capable() && CRYPTO_is_SSSE3_capable(); +} +OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_preferred(void) { + return hwaes_capable() && CRYPTO_is_AVX_capable(); +} +int aes_hw_set_encrypt_key_base(const uint8_t *user_key, int bits, + AES_KEY *key); +int aes_hw_set_encrypt_key_alt(const uint8_t *user_key, int bits, AES_KEY *key); +#endif // OPENSSL_X86 || OPENSSL_X86_64 + #else // If HWAES isn't defined then we provide dummy functions for each of the hwaes @@ -120,7 +146,7 @@ OPENSSL_INLINE void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, #if defined(HWAES_ECB) void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, const int enc); + const AES_KEY *key, int enc); #endif // HWAES_ECB @@ -218,7 +244,7 @@ void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks, const AES_KEY *key, const uint8_t ivec[16]); void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t *ivec, const int enc); + const AES_KEY *key, uint8_t *ivec, int enc); #if defined(__cplusplus) diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/key_wrap.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/key_wrap.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/key_wrap.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/key_wrap.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/mode_wrappers.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/mode_wrappers.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/mode_wrappers.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/aes/mode_wrappers.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-windows.windows.x86.S deleted file mode 100644 index cf3ab235..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-windows.windows.x86.S +++ /dev/null @@ -1,2473 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -%ifdef BORINGSSL_DISPATCH_TEST -extern _BORINGSSL_function_hit -%endif -global _aes_hw_encrypt -align 16 -_aes_hw_encrypt: -L$_aes_hw_encrypt_begin: -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$000pic -L$000pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+1-L$000pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - mov eax,DWORD [4+esp] - mov edx,DWORD [12+esp] - movups xmm2,[eax] - mov ecx,DWORD [240+edx] - mov eax,DWORD [8+esp] - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$001enc1_loop_1: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$001enc1_loop_1 -db 102,15,56,221,209 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - movups [eax],xmm2 - pxor xmm2,xmm2 - ret -global _aes_hw_decrypt -align 16 -_aes_hw_decrypt: -L$_aes_hw_decrypt_begin: - mov eax,DWORD [4+esp] - mov edx,DWORD [12+esp] - movups xmm2,[eax] - mov ecx,DWORD [240+edx] - mov eax,DWORD [8+esp] - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$002dec1_loop_2: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$002dec1_loop_2 -db 102,15,56,223,209 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - movups [eax],xmm2 - pxor xmm2,xmm2 - ret -align 16 -__aesni_encrypt2: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx - add ecx,16 -L$003enc2_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$003enc2_loop -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,221,208 -db 102,15,56,221,216 - ret -align 16 -__aesni_decrypt2: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx - add ecx,16 -L$004dec2_loop: -db 102,15,56,222,209 -db 102,15,56,222,217 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,222,208 -db 102,15,56,222,216 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$004dec2_loop -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,223,208 -db 102,15,56,223,216 - ret -align 16 -__aesni_encrypt3: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx - add ecx,16 -L$005enc3_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 -db 102,15,56,220,224 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$005enc3_loop -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -db 102,15,56,221,208 -db 102,15,56,221,216 -db 102,15,56,221,224 - ret -align 16 -__aesni_decrypt3: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx - add ecx,16 -L$006dec3_loop: -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,222,208 -db 102,15,56,222,216 -db 102,15,56,222,224 - movups 
xmm0,[ecx*1+edx-16] - jnz NEAR L$006dec3_loop -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -db 102,15,56,223,208 -db 102,15,56,223,216 -db 102,15,56,223,224 - ret -align 16 -__aesni_encrypt4: - movups xmm0,[edx] - movups xmm1,[16+edx] - shl ecx,4 - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 - pxor xmm5,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx -db 15,31,64,0 - add ecx,16 -L$007enc4_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -db 102,15,56,220,233 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 -db 102,15,56,220,224 -db 102,15,56,220,232 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$007enc4_loop -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -db 102,15,56,220,233 -db 102,15,56,221,208 -db 102,15,56,221,216 -db 102,15,56,221,224 -db 102,15,56,221,232 - ret -align 16 -__aesni_decrypt4: - movups xmm0,[edx] - movups xmm1,[16+edx] - shl ecx,4 - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 - pxor xmm5,xmm0 - movups xmm0,[32+edx] - lea edx,[32+ecx*1+edx] - neg ecx -db 15,31,64,0 - add ecx,16 -L$008dec4_loop: -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -db 102,15,56,222,233 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,222,208 -db 102,15,56,222,216 -db 102,15,56,222,224 -db 102,15,56,222,232 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$008dec4_loop -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -db 102,15,56,222,233 -db 102,15,56,223,208 -db 102,15,56,223,216 -db 102,15,56,223,224 -db 102,15,56,223,232 - ret -align 16 -__aesni_encrypt6: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 -db 102,15,56,220,209 - pxor xmm5,xmm0 - pxor xmm6,xmm0 -db 102,15,56,220,217 - lea edx,[32+ecx*1+edx] - neg ecx -db 102,15,56,220,225 - pxor xmm7,xmm0 - movups xmm0,[ecx*1+edx] - add ecx,16 - jmp NEAR L$009_aesni_encrypt6_inner -align 16 -L$010enc6_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -L$009_aesni_encrypt6_inner: -db 102,15,56,220,233 -db 102,15,56,220,241 -db 102,15,56,220,249 -L$_aesni_encrypt6_enter: - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 -db 102,15,56,220,224 -db 102,15,56,220,232 -db 102,15,56,220,240 -db 102,15,56,220,248 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$010enc6_loop -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,220,225 -db 102,15,56,220,233 -db 102,15,56,220,241 -db 102,15,56,220,249 -db 102,15,56,221,208 -db 102,15,56,221,216 -db 102,15,56,221,224 -db 102,15,56,221,232 -db 102,15,56,221,240 -db 102,15,56,221,248 - ret -align 16 -__aesni_decrypt6: - movups xmm0,[edx] - shl ecx,4 - movups xmm1,[16+edx] - xorps xmm2,xmm0 - pxor xmm3,xmm0 - pxor xmm4,xmm0 -db 102,15,56,222,209 - pxor xmm5,xmm0 - pxor xmm6,xmm0 -db 102,15,56,222,217 - lea edx,[32+ecx*1+edx] - neg ecx -db 102,15,56,222,225 - pxor xmm7,xmm0 - movups xmm0,[ecx*1+edx] - add ecx,16 - jmp NEAR L$011_aesni_decrypt6_inner -align 16 -L$012dec6_loop: -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -L$011_aesni_decrypt6_inner: -db 102,15,56,222,233 -db 102,15,56,222,241 -db 102,15,56,222,249 -L$_aesni_decrypt6_enter: - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,222,208 -db 102,15,56,222,216 -db 102,15,56,222,224 -db 102,15,56,222,232 -db 102,15,56,222,240 -db 102,15,56,222,248 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$012dec6_loop -db 102,15,56,222,209 -db 102,15,56,222,217 -db 102,15,56,222,225 -db 
102,15,56,222,233 -db 102,15,56,222,241 -db 102,15,56,222,249 -db 102,15,56,223,208 -db 102,15,56,223,216 -db 102,15,56,223,224 -db 102,15,56,223,232 -db 102,15,56,223,240 -db 102,15,56,223,248 - ret -global _aes_hw_ecb_encrypt -align 16 -_aes_hw_ecb_encrypt: -L$_aes_hw_ecb_encrypt_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - and eax,-16 - jz NEAR L$013ecb_ret - mov ecx,DWORD [240+edx] - test ebx,ebx - jz NEAR L$014ecb_decrypt - mov ebp,edx - mov ebx,ecx - cmp eax,96 - jb NEAR L$015ecb_enc_tail - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] - movdqu xmm4,[32+esi] - movdqu xmm5,[48+esi] - movdqu xmm6,[64+esi] - movdqu xmm7,[80+esi] - lea esi,[96+esi] - sub eax,96 - jmp NEAR L$016ecb_enc_loop6_enter -align 16 -L$017ecb_enc_loop6: - movups [edi],xmm2 - movdqu xmm2,[esi] - movups [16+edi],xmm3 - movdqu xmm3,[16+esi] - movups [32+edi],xmm4 - movdqu xmm4,[32+esi] - movups [48+edi],xmm5 - movdqu xmm5,[48+esi] - movups [64+edi],xmm6 - movdqu xmm6,[64+esi] - movups [80+edi],xmm7 - lea edi,[96+edi] - movdqu xmm7,[80+esi] - lea esi,[96+esi] -L$016ecb_enc_loop6_enter: - call __aesni_encrypt6 - mov edx,ebp - mov ecx,ebx - sub eax,96 - jnc NEAR L$017ecb_enc_loop6 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - movups [80+edi],xmm7 - lea edi,[96+edi] - add eax,96 - jz NEAR L$013ecb_ret -L$015ecb_enc_tail: - movups xmm2,[esi] - cmp eax,32 - jb NEAR L$018ecb_enc_one - movups xmm3,[16+esi] - je NEAR L$019ecb_enc_two - movups xmm4,[32+esi] - cmp eax,64 - jb NEAR L$020ecb_enc_three - movups xmm5,[48+esi] - je NEAR L$021ecb_enc_four - movups xmm6,[64+esi] - xorps xmm7,xmm7 - call __aesni_encrypt6 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - jmp NEAR L$013ecb_ret -align 16 -L$018ecb_enc_one: - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$022enc1_loop_3: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$022enc1_loop_3 -db 102,15,56,221,209 - movups [edi],xmm2 - jmp NEAR L$013ecb_ret -align 16 -L$019ecb_enc_two: - call __aesni_encrypt2 - movups [edi],xmm2 - movups [16+edi],xmm3 - jmp NEAR L$013ecb_ret -align 16 -L$020ecb_enc_three: - call __aesni_encrypt3 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - jmp NEAR L$013ecb_ret -align 16 -L$021ecb_enc_four: - call __aesni_encrypt4 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - jmp NEAR L$013ecb_ret -align 16 -L$014ecb_decrypt: - mov ebp,edx - mov ebx,ecx - cmp eax,96 - jb NEAR L$023ecb_dec_tail - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] - movdqu xmm4,[32+esi] - movdqu xmm5,[48+esi] - movdqu xmm6,[64+esi] - movdqu xmm7,[80+esi] - lea esi,[96+esi] - sub eax,96 - jmp NEAR L$024ecb_dec_loop6_enter -align 16 -L$025ecb_dec_loop6: - movups [edi],xmm2 - movdqu xmm2,[esi] - movups [16+edi],xmm3 - movdqu xmm3,[16+esi] - movups [32+edi],xmm4 - movdqu xmm4,[32+esi] - movups [48+edi],xmm5 - movdqu xmm5,[48+esi] - movups [64+edi],xmm6 - movdqu xmm6,[64+esi] - movups [80+edi],xmm7 - lea edi,[96+edi] - movdqu xmm7,[80+esi] - lea esi,[96+esi] -L$024ecb_dec_loop6_enter: - call __aesni_decrypt6 - mov edx,ebp - mov ecx,ebx - sub eax,96 - jnc NEAR L$025ecb_dec_loop6 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - movups 
[80+edi],xmm7 - lea edi,[96+edi] - add eax,96 - jz NEAR L$013ecb_ret -L$023ecb_dec_tail: - movups xmm2,[esi] - cmp eax,32 - jb NEAR L$026ecb_dec_one - movups xmm3,[16+esi] - je NEAR L$027ecb_dec_two - movups xmm4,[32+esi] - cmp eax,64 - jb NEAR L$028ecb_dec_three - movups xmm5,[48+esi] - je NEAR L$029ecb_dec_four - movups xmm6,[64+esi] - xorps xmm7,xmm7 - call __aesni_decrypt6 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - jmp NEAR L$013ecb_ret -align 16 -L$026ecb_dec_one: - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$030dec1_loop_4: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$030dec1_loop_4 -db 102,15,56,223,209 - movups [edi],xmm2 - jmp NEAR L$013ecb_ret -align 16 -L$027ecb_dec_two: - call __aesni_decrypt2 - movups [edi],xmm2 - movups [16+edi],xmm3 - jmp NEAR L$013ecb_ret -align 16 -L$028ecb_dec_three: - call __aesni_decrypt3 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - jmp NEAR L$013ecb_ret -align 16 -L$029ecb_dec_four: - call __aesni_decrypt4 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 -L$013ecb_ret: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_ccm64_encrypt_blocks -align 16 -_aes_hw_ccm64_encrypt_blocks: -L$_aes_hw_ccm64_encrypt_blocks_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - mov ecx,DWORD [40+esp] - mov ebp,esp - sub esp,60 - and esp,-16 - mov DWORD [48+esp],ebp - movdqu xmm7,[ebx] - movdqu xmm3,[ecx] - mov ecx,DWORD [240+edx] - mov DWORD [esp],202182159 - mov DWORD [4+esp],134810123 - mov DWORD [8+esp],67438087 - mov DWORD [12+esp],66051 - mov ebx,1 - xor ebp,ebp - mov DWORD [16+esp],ebx - mov DWORD [20+esp],ebp - mov DWORD [24+esp],ebp - mov DWORD [28+esp],ebp - shl ecx,4 - mov ebx,16 - lea ebp,[edx] - movdqa xmm5,[esp] - movdqa xmm2,xmm7 - lea edx,[32+ecx*1+edx] - sub ebx,ecx -db 102,15,56,0,253 -L$031ccm64_enc_outer: - movups xmm0,[ebp] - mov ecx,ebx - movups xmm6,[esi] - xorps xmm2,xmm0 - movups xmm1,[16+ebp] - xorps xmm0,xmm6 - xorps xmm3,xmm0 - movups xmm0,[32+ebp] -L$032ccm64_enc2_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$032ccm64_enc2_loop -db 102,15,56,220,209 -db 102,15,56,220,217 - paddq xmm7,[16+esp] - dec eax -db 102,15,56,221,208 -db 102,15,56,221,216 - lea esi,[16+esi] - xorps xmm6,xmm2 - movdqa xmm2,xmm7 - movups [edi],xmm6 -db 102,15,56,0,213 - lea edi,[16+edi] - jnz NEAR L$031ccm64_enc_outer - mov esp,DWORD [48+esp] - mov edi,DWORD [40+esp] - movups [edi],xmm3 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_ccm64_decrypt_blocks -align 16 -_aes_hw_ccm64_decrypt_blocks: -L$_aes_hw_ccm64_decrypt_blocks_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - mov ecx,DWORD [40+esp] - mov ebp,esp - sub esp,60 - and esp,-16 - mov DWORD [48+esp],ebp - movdqu xmm7,[ebx] - movdqu 
xmm3,[ecx] - mov ecx,DWORD [240+edx] - mov DWORD [esp],202182159 - mov DWORD [4+esp],134810123 - mov DWORD [8+esp],67438087 - mov DWORD [12+esp],66051 - mov ebx,1 - xor ebp,ebp - mov DWORD [16+esp],ebx - mov DWORD [20+esp],ebp - mov DWORD [24+esp],ebp - mov DWORD [28+esp],ebp - movdqa xmm5,[esp] - movdqa xmm2,xmm7 - mov ebp,edx - mov ebx,ecx -db 102,15,56,0,253 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$033enc1_loop_5: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$033enc1_loop_5 -db 102,15,56,221,209 - shl ebx,4 - mov ecx,16 - movups xmm6,[esi] - paddq xmm7,[16+esp] - lea esi,[16+esi] - sub ecx,ebx - lea edx,[32+ebx*1+ebp] - mov ebx,ecx - jmp NEAR L$034ccm64_dec_outer -align 16 -L$034ccm64_dec_outer: - xorps xmm6,xmm2 - movdqa xmm2,xmm7 - movups [edi],xmm6 - lea edi,[16+edi] -db 102,15,56,0,213 - sub eax,1 - jz NEAR L$035ccm64_dec_break - movups xmm0,[ebp] - mov ecx,ebx - movups xmm1,[16+ebp] - xorps xmm6,xmm0 - xorps xmm2,xmm0 - xorps xmm3,xmm6 - movups xmm0,[32+ebp] -L$036ccm64_dec2_loop: -db 102,15,56,220,209 -db 102,15,56,220,217 - movups xmm1,[ecx*1+edx] - add ecx,32 -db 102,15,56,220,208 -db 102,15,56,220,216 - movups xmm0,[ecx*1+edx-16] - jnz NEAR L$036ccm64_dec2_loop - movups xmm6,[esi] - paddq xmm7,[16+esp] -db 102,15,56,220,209 -db 102,15,56,220,217 -db 102,15,56,221,208 -db 102,15,56,221,216 - lea esi,[16+esi] - jmp NEAR L$034ccm64_dec_outer -align 16 -L$035ccm64_dec_break: - mov ecx,DWORD [240+ebp] - mov edx,ebp - movups xmm0,[edx] - movups xmm1,[16+edx] - xorps xmm6,xmm0 - lea edx,[32+edx] - xorps xmm3,xmm6 -L$037enc1_loop_6: -db 102,15,56,220,217 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$037enc1_loop_6 -db 102,15,56,221,217 - mov esp,DWORD [48+esp] - mov edi,DWORD [40+esp] - movups [edi],xmm3 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_ctr32_encrypt_blocks -align 16 -_aes_hw_ctr32_encrypt_blocks: -L$_aes_hw_ctr32_encrypt_blocks_begin: - push ebp - push ebx - push esi - push edi -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$038pic -L$038pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+0-L$038pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebx,DWORD [36+esp] - mov ebp,esp - sub esp,88 - and esp,-16 - mov DWORD [80+esp],ebp - cmp eax,1 - je NEAR L$039ctr32_one_shortcut - movdqu xmm7,[ebx] - mov DWORD [esp],202182159 - mov DWORD [4+esp],134810123 - mov DWORD [8+esp],67438087 - mov DWORD [12+esp],66051 - mov ecx,6 - xor ebp,ebp - mov DWORD [16+esp],ecx - mov DWORD [20+esp],ecx - mov DWORD [24+esp],ecx - mov DWORD [28+esp],ebp -db 102,15,58,22,251,3 -db 102,15,58,34,253,3 - mov ecx,DWORD [240+edx] - bswap ebx - pxor xmm0,xmm0 - pxor xmm1,xmm1 - movdqa xmm2,[esp] -db 102,15,58,34,195,0 - lea ebp,[3+ebx] -db 102,15,58,34,205,0 - inc ebx -db 102,15,58,34,195,1 - inc ebp -db 102,15,58,34,205,1 - inc ebx -db 102,15,58,34,195,2 - inc ebp -db 102,15,58,34,205,2 - movdqa [48+esp],xmm0 -db 102,15,56,0,194 - movdqu xmm6,[edx] - movdqa [64+esp],xmm1 -db 102,15,56,0,202 - pshufd xmm2,xmm0,192 - pshufd xmm3,xmm0,128 - cmp eax,6 - jb NEAR L$040ctr32_tail - pxor xmm7,xmm6 - shl ecx,4 - mov ebx,16 - movdqa [32+esp],xmm7 - mov ebp,edx - sub ebx,ecx - lea edx,[32+ecx*1+edx] - sub eax,6 - jmp NEAR 
L$041ctr32_loop6 -align 16 -L$041ctr32_loop6: - pshufd xmm4,xmm0,64 - movdqa xmm0,[32+esp] - pshufd xmm5,xmm1,192 - pxor xmm2,xmm0 - pshufd xmm6,xmm1,128 - pxor xmm3,xmm0 - pshufd xmm7,xmm1,64 - movups xmm1,[16+ebp] - pxor xmm4,xmm0 - pxor xmm5,xmm0 -db 102,15,56,220,209 - pxor xmm6,xmm0 - pxor xmm7,xmm0 -db 102,15,56,220,217 - movups xmm0,[32+ebp] - mov ecx,ebx -db 102,15,56,220,225 -db 102,15,56,220,233 -db 102,15,56,220,241 -db 102,15,56,220,249 - call L$_aesni_encrypt6_enter - movups xmm1,[esi] - movups xmm0,[16+esi] - xorps xmm2,xmm1 - movups xmm1,[32+esi] - xorps xmm3,xmm0 - movups [edi],xmm2 - movdqa xmm0,[16+esp] - xorps xmm4,xmm1 - movdqa xmm1,[64+esp] - movups [16+edi],xmm3 - movups [32+edi],xmm4 - paddd xmm1,xmm0 - paddd xmm0,[48+esp] - movdqa xmm2,[esp] - movups xmm3,[48+esi] - movups xmm4,[64+esi] - xorps xmm5,xmm3 - movups xmm3,[80+esi] - lea esi,[96+esi] - movdqa [48+esp],xmm0 -db 102,15,56,0,194 - xorps xmm6,xmm4 - movups [48+edi],xmm5 - xorps xmm7,xmm3 - movdqa [64+esp],xmm1 -db 102,15,56,0,202 - movups [64+edi],xmm6 - pshufd xmm2,xmm0,192 - movups [80+edi],xmm7 - lea edi,[96+edi] - pshufd xmm3,xmm0,128 - sub eax,6 - jnc NEAR L$041ctr32_loop6 - add eax,6 - jz NEAR L$042ctr32_ret - movdqu xmm7,[ebp] - mov edx,ebp - pxor xmm7,[32+esp] - mov ecx,DWORD [240+ebp] -L$040ctr32_tail: - por xmm2,xmm7 - cmp eax,2 - jb NEAR L$043ctr32_one - pshufd xmm4,xmm0,64 - por xmm3,xmm7 - je NEAR L$044ctr32_two - pshufd xmm5,xmm1,192 - por xmm4,xmm7 - cmp eax,4 - jb NEAR L$045ctr32_three - pshufd xmm6,xmm1,128 - por xmm5,xmm7 - je NEAR L$046ctr32_four - por xmm6,xmm7 - call __aesni_encrypt6 - movups xmm1,[esi] - movups xmm0,[16+esi] - xorps xmm2,xmm1 - movups xmm1,[32+esi] - xorps xmm3,xmm0 - movups xmm0,[48+esi] - xorps xmm4,xmm1 - movups xmm1,[64+esi] - xorps xmm5,xmm0 - movups [edi],xmm2 - xorps xmm6,xmm1 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - jmp NEAR L$042ctr32_ret -align 16 -L$039ctr32_one_shortcut: - movups xmm2,[ebx] - mov ecx,DWORD [240+edx] -L$043ctr32_one: - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$047enc1_loop_7: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$047enc1_loop_7 -db 102,15,56,221,209 - movups xmm6,[esi] - xorps xmm6,xmm2 - movups [edi],xmm6 - jmp NEAR L$042ctr32_ret -align 16 -L$044ctr32_two: - call __aesni_encrypt2 - movups xmm5,[esi] - movups xmm6,[16+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - movups [edi],xmm2 - movups [16+edi],xmm3 - jmp NEAR L$042ctr32_ret -align 16 -L$045ctr32_three: - call __aesni_encrypt3 - movups xmm5,[esi] - movups xmm6,[16+esi] - xorps xmm2,xmm5 - movups xmm7,[32+esi] - xorps xmm3,xmm6 - movups [edi],xmm2 - xorps xmm4,xmm7 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - jmp NEAR L$042ctr32_ret -align 16 -L$046ctr32_four: - call __aesni_encrypt4 - movups xmm6,[esi] - movups xmm7,[16+esi] - movups xmm1,[32+esi] - xorps xmm2,xmm6 - movups xmm0,[48+esi] - xorps xmm3,xmm7 - movups [edi],xmm2 - xorps xmm4,xmm1 - movups [16+edi],xmm3 - xorps xmm5,xmm0 - movups [32+edi],xmm4 - movups [48+edi],xmm5 -L$042ctr32_ret: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - movdqa [32+esp],xmm0 - pxor xmm5,xmm5 - movdqa [48+esp],xmm0 - pxor xmm6,xmm6 - movdqa [64+esp],xmm0 - pxor xmm7,xmm7 - mov esp,DWORD [80+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_xts_encrypt -align 16 -_aes_hw_xts_encrypt: -L$_aes_hw_xts_encrypt_begin: - push ebp - push ebx - push esi - push edi - mov 
edx,DWORD [36+esp] - mov esi,DWORD [40+esp] - mov ecx,DWORD [240+edx] - movups xmm2,[esi] - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$048enc1_loop_8: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$048enc1_loop_8 -db 102,15,56,221,209 - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebp,esp - sub esp,120 - mov ecx,DWORD [240+edx] - and esp,-16 - mov DWORD [96+esp],135 - mov DWORD [100+esp],0 - mov DWORD [104+esp],1 - mov DWORD [108+esp],0 - mov DWORD [112+esp],eax - mov DWORD [116+esp],ebp - movdqa xmm1,xmm2 - pxor xmm0,xmm0 - movdqa xmm3,[96+esp] - pcmpgtd xmm0,xmm1 - and eax,-16 - mov ebp,edx - mov ebx,ecx - sub eax,96 - jc NEAR L$049xts_enc_short - shl ecx,4 - mov ebx,16 - sub ebx,ecx - lea edx,[32+ecx*1+edx] - jmp NEAR L$050xts_enc_loop6 -align 16 -L$050xts_enc_loop6: - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [16+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [32+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [48+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm7,xmm0,19 - movdqa [64+esp],xmm1 - paddq xmm1,xmm1 - movups xmm0,[ebp] - pand xmm7,xmm3 - movups xmm2,[esi] - pxor xmm7,xmm1 - mov ecx,ebx - movdqu xmm3,[16+esi] - xorps xmm2,xmm0 - movdqu xmm4,[32+esi] - pxor xmm3,xmm0 - movdqu xmm5,[48+esi] - pxor xmm4,xmm0 - movdqu xmm6,[64+esi] - pxor xmm5,xmm0 - movdqu xmm1,[80+esi] - pxor xmm6,xmm0 - lea esi,[96+esi] - pxor xmm2,[esp] - movdqa [80+esp],xmm7 - pxor xmm7,xmm1 - movups xmm1,[16+ebp] - pxor xmm3,[16+esp] - pxor xmm4,[32+esp] -db 102,15,56,220,209 - pxor xmm5,[48+esp] - pxor xmm6,[64+esp] -db 102,15,56,220,217 - pxor xmm7,xmm0 - movups xmm0,[32+ebp] -db 102,15,56,220,225 -db 102,15,56,220,233 -db 102,15,56,220,241 -db 102,15,56,220,249 - call L$_aesni_encrypt6_enter - movdqa xmm1,[80+esp] - pxor xmm0,xmm0 - xorps xmm2,[esp] - pcmpgtd xmm0,xmm1 - xorps xmm3,[16+esp] - movups [edi],xmm2 - xorps xmm4,[32+esp] - movups [16+edi],xmm3 - xorps xmm5,[48+esp] - movups [32+edi],xmm4 - xorps xmm6,[64+esp] - movups [48+edi],xmm5 - xorps xmm7,xmm1 - movups [64+edi],xmm6 - pshufd xmm2,xmm0,19 - movups [80+edi],xmm7 - lea edi,[96+edi] - movdqa xmm3,[96+esp] - pxor xmm0,xmm0 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - sub eax,96 - jnc NEAR L$050xts_enc_loop6 - mov ecx,DWORD [240+ebp] - mov edx,ebp - mov ebx,ecx -L$049xts_enc_short: - add eax,96 - jz NEAR L$051xts_enc_done6x - movdqa xmm5,xmm1 - cmp eax,32 - jb NEAR L$052xts_enc_one - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - je NEAR L$053xts_enc_two - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm6,xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - cmp eax,64 - jb NEAR L$054xts_enc_three - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm7,xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - movdqa [esp],xmm5 - movdqa [16+esp],xmm6 - je NEAR L$055xts_enc_four - movdqa [32+esp],xmm7 - pshufd xmm7,xmm0,19 - movdqa [48+esp],xmm1 - paddq xmm1,xmm1 - pand xmm7,xmm3 - pxor xmm7,xmm1 - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] 
- movdqu xmm4,[32+esi] - pxor xmm2,[esp] - movdqu xmm5,[48+esi] - pxor xmm3,[16+esp] - movdqu xmm6,[64+esi] - pxor xmm4,[32+esp] - lea esi,[80+esi] - pxor xmm5,[48+esp] - movdqa [64+esp],xmm7 - pxor xmm6,xmm7 - call __aesni_encrypt6 - movaps xmm1,[64+esp] - xorps xmm2,[esp] - xorps xmm3,[16+esp] - xorps xmm4,[32+esp] - movups [edi],xmm2 - xorps xmm5,[48+esp] - movups [16+edi],xmm3 - xorps xmm6,xmm1 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - lea edi,[80+edi] - jmp NEAR L$056xts_enc_done -align 16 -L$052xts_enc_one: - movups xmm2,[esi] - lea esi,[16+esi] - xorps xmm2,xmm5 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$057enc1_loop_9: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$057enc1_loop_9 -db 102,15,56,221,209 - xorps xmm2,xmm5 - movups [edi],xmm2 - lea edi,[16+edi] - movdqa xmm1,xmm5 - jmp NEAR L$056xts_enc_done -align 16 -L$053xts_enc_two: - movaps xmm6,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - lea esi,[32+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - call __aesni_encrypt2 - xorps xmm2,xmm5 - xorps xmm3,xmm6 - movups [edi],xmm2 - movups [16+edi],xmm3 - lea edi,[32+edi] - movdqa xmm1,xmm6 - jmp NEAR L$056xts_enc_done -align 16 -L$054xts_enc_three: - movaps xmm7,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - movups xmm4,[32+esi] - lea esi,[48+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - xorps xmm4,xmm7 - call __aesni_encrypt3 - xorps xmm2,xmm5 - xorps xmm3,xmm6 - xorps xmm4,xmm7 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - lea edi,[48+edi] - movdqa xmm1,xmm7 - jmp NEAR L$056xts_enc_done -align 16 -L$055xts_enc_four: - movaps xmm6,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - movups xmm4,[32+esi] - xorps xmm2,[esp] - movups xmm5,[48+esi] - lea esi,[64+esi] - xorps xmm3,[16+esp] - xorps xmm4,xmm7 - xorps xmm5,xmm6 - call __aesni_encrypt4 - xorps xmm2,[esp] - xorps xmm3,[16+esp] - xorps xmm4,xmm7 - movups [edi],xmm2 - xorps xmm5,xmm6 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - lea edi,[64+edi] - movdqa xmm1,xmm6 - jmp NEAR L$056xts_enc_done -align 16 -L$051xts_enc_done6x: - mov eax,DWORD [112+esp] - and eax,15 - jz NEAR L$058xts_enc_ret - movdqa xmm5,xmm1 - mov DWORD [112+esp],eax - jmp NEAR L$059xts_enc_steal -align 16 -L$056xts_enc_done: - mov eax,DWORD [112+esp] - pxor xmm0,xmm0 - and eax,15 - jz NEAR L$058xts_enc_ret - pcmpgtd xmm0,xmm1 - mov DWORD [112+esp],eax - pshufd xmm5,xmm0,19 - paddq xmm1,xmm1 - pand xmm5,[96+esp] - pxor xmm5,xmm1 -L$059xts_enc_steal: - movzx ecx,BYTE [esi] - movzx edx,BYTE [edi-16] - lea esi,[1+esi] - mov BYTE [edi-16],cl - mov BYTE [edi],dl - lea edi,[1+edi] - sub eax,1 - jnz NEAR L$059xts_enc_steal - sub edi,DWORD [112+esp] - mov edx,ebp - mov ecx,ebx - movups xmm2,[edi-16] - xorps xmm2,xmm5 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$060enc1_loop_10: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$060enc1_loop_10 -db 102,15,56,221,209 - xorps xmm2,xmm5 - movups [edi-16],xmm2 -L$058xts_enc_ret: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movdqa [esp],xmm0 - pxor xmm3,xmm3 - movdqa [16+esp],xmm0 - pxor xmm4,xmm4 - movdqa [32+esp],xmm0 - pxor xmm5,xmm5 - movdqa [48+esp],xmm0 - pxor xmm6,xmm6 - movdqa [64+esp],xmm0 - pxor xmm7,xmm7 - movdqa [80+esp],xmm0 - mov esp,DWORD [116+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_xts_decrypt -align 16 -_aes_hw_xts_decrypt: -L$_aes_hw_xts_decrypt_begin: - push 
ebp - push ebx - push esi - push edi - mov edx,DWORD [36+esp] - mov esi,DWORD [40+esp] - mov ecx,DWORD [240+edx] - movups xmm2,[esi] - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$061enc1_loop_11: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$061enc1_loop_11 -db 102,15,56,221,209 - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - mov ebp,esp - sub esp,120 - and esp,-16 - xor ebx,ebx - test eax,15 - setnz bl - shl ebx,4 - sub eax,ebx - mov DWORD [96+esp],135 - mov DWORD [100+esp],0 - mov DWORD [104+esp],1 - mov DWORD [108+esp],0 - mov DWORD [112+esp],eax - mov DWORD [116+esp],ebp - mov ecx,DWORD [240+edx] - mov ebp,edx - mov ebx,ecx - movdqa xmm1,xmm2 - pxor xmm0,xmm0 - movdqa xmm3,[96+esp] - pcmpgtd xmm0,xmm1 - and eax,-16 - sub eax,96 - jc NEAR L$062xts_dec_short - shl ecx,4 - mov ebx,16 - sub ebx,ecx - lea edx,[32+ecx*1+edx] - jmp NEAR L$063xts_dec_loop6 -align 16 -L$063xts_dec_loop6: - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [16+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [32+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa [48+esp],xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - pshufd xmm7,xmm0,19 - movdqa [64+esp],xmm1 - paddq xmm1,xmm1 - movups xmm0,[ebp] - pand xmm7,xmm3 - movups xmm2,[esi] - pxor xmm7,xmm1 - mov ecx,ebx - movdqu xmm3,[16+esi] - xorps xmm2,xmm0 - movdqu xmm4,[32+esi] - pxor xmm3,xmm0 - movdqu xmm5,[48+esi] - pxor xmm4,xmm0 - movdqu xmm6,[64+esi] - pxor xmm5,xmm0 - movdqu xmm1,[80+esi] - pxor xmm6,xmm0 - lea esi,[96+esi] - pxor xmm2,[esp] - movdqa [80+esp],xmm7 - pxor xmm7,xmm1 - movups xmm1,[16+ebp] - pxor xmm3,[16+esp] - pxor xmm4,[32+esp] -db 102,15,56,222,209 - pxor xmm5,[48+esp] - pxor xmm6,[64+esp] -db 102,15,56,222,217 - pxor xmm7,xmm0 - movups xmm0,[32+ebp] -db 102,15,56,222,225 -db 102,15,56,222,233 -db 102,15,56,222,241 -db 102,15,56,222,249 - call L$_aesni_decrypt6_enter - movdqa xmm1,[80+esp] - pxor xmm0,xmm0 - xorps xmm2,[esp] - pcmpgtd xmm0,xmm1 - xorps xmm3,[16+esp] - movups [edi],xmm2 - xorps xmm4,[32+esp] - movups [16+edi],xmm3 - xorps xmm5,[48+esp] - movups [32+edi],xmm4 - xorps xmm6,[64+esp] - movups [48+edi],xmm5 - xorps xmm7,xmm1 - movups [64+edi],xmm6 - pshufd xmm2,xmm0,19 - movups [80+edi],xmm7 - lea edi,[96+edi] - movdqa xmm3,[96+esp] - pxor xmm0,xmm0 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - sub eax,96 - jnc NEAR L$063xts_dec_loop6 - mov ecx,DWORD [240+ebp] - mov edx,ebp - mov ebx,ecx -L$062xts_dec_short: - add eax,96 - jz NEAR L$064xts_dec_done6x - movdqa xmm5,xmm1 - cmp eax,32 - jb NEAR L$065xts_dec_one - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - je NEAR L$066xts_dec_two - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm6,xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - cmp eax,64 - jb NEAR L$067xts_dec_three - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm7,xmm1 - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 - movdqa [esp],xmm5 - movdqa [16+esp],xmm6 - je NEAR L$068xts_dec_four - movdqa [32+esp],xmm7 - pshufd xmm7,xmm0,19 - movdqa 
[48+esp],xmm1 - paddq xmm1,xmm1 - pand xmm7,xmm3 - pxor xmm7,xmm1 - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] - movdqu xmm4,[32+esi] - pxor xmm2,[esp] - movdqu xmm5,[48+esi] - pxor xmm3,[16+esp] - movdqu xmm6,[64+esi] - pxor xmm4,[32+esp] - lea esi,[80+esi] - pxor xmm5,[48+esp] - movdqa [64+esp],xmm7 - pxor xmm6,xmm7 - call __aesni_decrypt6 - movaps xmm1,[64+esp] - xorps xmm2,[esp] - xorps xmm3,[16+esp] - xorps xmm4,[32+esp] - movups [edi],xmm2 - xorps xmm5,[48+esp] - movups [16+edi],xmm3 - xorps xmm6,xmm1 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - movups [64+edi],xmm6 - lea edi,[80+edi] - jmp NEAR L$069xts_dec_done -align 16 -L$065xts_dec_one: - movups xmm2,[esi] - lea esi,[16+esi] - xorps xmm2,xmm5 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$070dec1_loop_12: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$070dec1_loop_12 -db 102,15,56,223,209 - xorps xmm2,xmm5 - movups [edi],xmm2 - lea edi,[16+edi] - movdqa xmm1,xmm5 - jmp NEAR L$069xts_dec_done -align 16 -L$066xts_dec_two: - movaps xmm6,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - lea esi,[32+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - call __aesni_decrypt2 - xorps xmm2,xmm5 - xorps xmm3,xmm6 - movups [edi],xmm2 - movups [16+edi],xmm3 - lea edi,[32+edi] - movdqa xmm1,xmm6 - jmp NEAR L$069xts_dec_done -align 16 -L$067xts_dec_three: - movaps xmm7,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - movups xmm4,[32+esi] - lea esi,[48+esi] - xorps xmm2,xmm5 - xorps xmm3,xmm6 - xorps xmm4,xmm7 - call __aesni_decrypt3 - xorps xmm2,xmm5 - xorps xmm3,xmm6 - xorps xmm4,xmm7 - movups [edi],xmm2 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - lea edi,[48+edi] - movdqa xmm1,xmm7 - jmp NEAR L$069xts_dec_done -align 16 -L$068xts_dec_four: - movaps xmm6,xmm1 - movups xmm2,[esi] - movups xmm3,[16+esi] - movups xmm4,[32+esi] - xorps xmm2,[esp] - movups xmm5,[48+esi] - lea esi,[64+esi] - xorps xmm3,[16+esp] - xorps xmm4,xmm7 - xorps xmm5,xmm6 - call __aesni_decrypt4 - xorps xmm2,[esp] - xorps xmm3,[16+esp] - xorps xmm4,xmm7 - movups [edi],xmm2 - xorps xmm5,xmm6 - movups [16+edi],xmm3 - movups [32+edi],xmm4 - movups [48+edi],xmm5 - lea edi,[64+edi] - movdqa xmm1,xmm6 - jmp NEAR L$069xts_dec_done -align 16 -L$064xts_dec_done6x: - mov eax,DWORD [112+esp] - and eax,15 - jz NEAR L$071xts_dec_ret - mov DWORD [112+esp],eax - jmp NEAR L$072xts_dec_only_one_more -align 16 -L$069xts_dec_done: - mov eax,DWORD [112+esp] - pxor xmm0,xmm0 - and eax,15 - jz NEAR L$071xts_dec_ret - pcmpgtd xmm0,xmm1 - mov DWORD [112+esp],eax - pshufd xmm2,xmm0,19 - pxor xmm0,xmm0 - movdqa xmm3,[96+esp] - paddq xmm1,xmm1 - pand xmm2,xmm3 - pcmpgtd xmm0,xmm1 - pxor xmm1,xmm2 -L$072xts_dec_only_one_more: - pshufd xmm5,xmm0,19 - movdqa xmm6,xmm1 - paddq xmm1,xmm1 - pand xmm5,xmm3 - pxor xmm5,xmm1 - mov edx,ebp - mov ecx,ebx - movups xmm2,[esi] - xorps xmm2,xmm5 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$073dec1_loop_13: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$073dec1_loop_13 -db 102,15,56,223,209 - xorps xmm2,xmm5 - movups [edi],xmm2 -L$074xts_dec_steal: - movzx ecx,BYTE [16+esi] - movzx edx,BYTE [edi] - lea esi,[1+esi] - mov BYTE [edi],cl - mov BYTE [16+edi],dl - lea edi,[1+edi] - sub eax,1 - jnz NEAR L$074xts_dec_steal - sub edi,DWORD [112+esp] - mov edx,ebp - mov ecx,ebx - movups xmm2,[edi] - xorps xmm2,xmm6 - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$075dec1_loop_14: -db 
102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$075dec1_loop_14 -db 102,15,56,223,209 - xorps xmm2,xmm6 - movups [edi],xmm2 -L$071xts_dec_ret: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - movdqa [esp],xmm0 - pxor xmm3,xmm3 - movdqa [16+esp],xmm0 - pxor xmm4,xmm4 - movdqa [32+esp],xmm0 - pxor xmm5,xmm5 - movdqa [48+esp],xmm0 - pxor xmm6,xmm6 - movdqa [64+esp],xmm0 - pxor xmm7,xmm7 - movdqa [80+esp],xmm0 - mov esp,DWORD [116+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _aes_hw_cbc_encrypt -align 16 -_aes_hw_cbc_encrypt: -L$_aes_hw_cbc_encrypt_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov ebx,esp - mov edi,DWORD [24+esp] - sub ebx,24 - mov eax,DWORD [28+esp] - and ebx,-16 - mov edx,DWORD [32+esp] - mov ebp,DWORD [36+esp] - test eax,eax - jz NEAR L$076cbc_abort - cmp DWORD [40+esp],0 - xchg ebx,esp - movups xmm7,[ebp] - mov ecx,DWORD [240+edx] - mov ebp,edx - mov DWORD [16+esp],ebx - mov ebx,ecx - je NEAR L$077cbc_decrypt - movaps xmm2,xmm7 - cmp eax,16 - jb NEAR L$078cbc_enc_tail - sub eax,16 - jmp NEAR L$079cbc_enc_loop -align 16 -L$079cbc_enc_loop: - movups xmm7,[esi] - lea esi,[16+esi] - movups xmm0,[edx] - movups xmm1,[16+edx] - xorps xmm7,xmm0 - lea edx,[32+edx] - xorps xmm2,xmm7 -L$080enc1_loop_15: -db 102,15,56,220,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$080enc1_loop_15 -db 102,15,56,221,209 - mov ecx,ebx - mov edx,ebp - movups [edi],xmm2 - lea edi,[16+edi] - sub eax,16 - jnc NEAR L$079cbc_enc_loop - add eax,16 - jnz NEAR L$078cbc_enc_tail - movaps xmm7,xmm2 - pxor xmm2,xmm2 - jmp NEAR L$081cbc_ret -L$078cbc_enc_tail: - mov ecx,eax -dd 2767451785 - mov ecx,16 - sub ecx,eax - xor eax,eax -dd 2868115081 - lea edi,[edi-16] - mov ecx,ebx - mov esi,edi - mov edx,ebp - jmp NEAR L$079cbc_enc_loop -align 16 -L$077cbc_decrypt: - cmp eax,80 - jbe NEAR L$082cbc_dec_tail - movaps [esp],xmm7 - sub eax,80 - jmp NEAR L$083cbc_dec_loop6_enter -align 16 -L$084cbc_dec_loop6: - movaps [esp],xmm0 - movups [edi],xmm7 - lea edi,[16+edi] -L$083cbc_dec_loop6_enter: - movdqu xmm2,[esi] - movdqu xmm3,[16+esi] - movdqu xmm4,[32+esi] - movdqu xmm5,[48+esi] - movdqu xmm6,[64+esi] - movdqu xmm7,[80+esi] - call __aesni_decrypt6 - movups xmm1,[esi] - movups xmm0,[16+esi] - xorps xmm2,[esp] - xorps xmm3,xmm1 - movups xmm1,[32+esi] - xorps xmm4,xmm0 - movups xmm0,[48+esi] - xorps xmm5,xmm1 - movups xmm1,[64+esi] - xorps xmm6,xmm0 - movups xmm0,[80+esi] - xorps xmm7,xmm1 - movups [edi],xmm2 - movups [16+edi],xmm3 - lea esi,[96+esi] - movups [32+edi],xmm4 - mov ecx,ebx - movups [48+edi],xmm5 - mov edx,ebp - movups [64+edi],xmm6 - lea edi,[80+edi] - sub eax,96 - ja NEAR L$084cbc_dec_loop6 - movaps xmm2,xmm7 - movaps xmm7,xmm0 - add eax,80 - jle NEAR L$085cbc_dec_clear_tail_collected - movups [edi],xmm2 - lea edi,[16+edi] -L$082cbc_dec_tail: - movups xmm2,[esi] - movaps xmm6,xmm2 - cmp eax,16 - jbe NEAR L$086cbc_dec_one - movups xmm3,[16+esi] - movaps xmm5,xmm3 - cmp eax,32 - jbe NEAR L$087cbc_dec_two - movups xmm4,[32+esi] - cmp eax,48 - jbe NEAR L$088cbc_dec_three - movups xmm5,[48+esi] - cmp eax,64 - jbe NEAR L$089cbc_dec_four - movups xmm6,[64+esi] - movaps [esp],xmm7 - movups xmm2,[esi] - xorps xmm7,xmm7 - call __aesni_decrypt6 - movups xmm1,[esi] - movups xmm0,[16+esi] - xorps xmm2,[esp] - xorps xmm3,xmm1 - movups xmm1,[32+esi] - xorps xmm4,xmm0 - movups xmm0,[48+esi] - xorps xmm5,xmm1 - movups xmm7,[64+esi] - xorps xmm6,xmm0 - movups [edi],xmm2 - movups [16+edi],xmm3 - pxor xmm3,xmm3 - movups 
[32+edi],xmm4 - pxor xmm4,xmm4 - movups [48+edi],xmm5 - pxor xmm5,xmm5 - lea edi,[64+edi] - movaps xmm2,xmm6 - pxor xmm6,xmm6 - sub eax,80 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$086cbc_dec_one: - movups xmm0,[edx] - movups xmm1,[16+edx] - lea edx,[32+edx] - xorps xmm2,xmm0 -L$091dec1_loop_16: -db 102,15,56,222,209 - dec ecx - movups xmm1,[edx] - lea edx,[16+edx] - jnz NEAR L$091dec1_loop_16 -db 102,15,56,223,209 - xorps xmm2,xmm7 - movaps xmm7,xmm6 - sub eax,16 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$087cbc_dec_two: - call __aesni_decrypt2 - xorps xmm2,xmm7 - xorps xmm3,xmm6 - movups [edi],xmm2 - movaps xmm2,xmm3 - pxor xmm3,xmm3 - lea edi,[16+edi] - movaps xmm7,xmm5 - sub eax,32 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$088cbc_dec_three: - call __aesni_decrypt3 - xorps xmm2,xmm7 - xorps xmm3,xmm6 - xorps xmm4,xmm5 - movups [edi],xmm2 - movaps xmm2,xmm4 - pxor xmm4,xmm4 - movups [16+edi],xmm3 - pxor xmm3,xmm3 - lea edi,[32+edi] - movups xmm7,[32+esi] - sub eax,48 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$089cbc_dec_four: - call __aesni_decrypt4 - movups xmm1,[16+esi] - movups xmm0,[32+esi] - xorps xmm2,xmm7 - movups xmm7,[48+esi] - xorps xmm3,xmm6 - movups [edi],xmm2 - xorps xmm4,xmm1 - movups [16+edi],xmm3 - pxor xmm3,xmm3 - xorps xmm5,xmm0 - movups [32+edi],xmm4 - pxor xmm4,xmm4 - lea edi,[48+edi] - movaps xmm2,xmm5 - pxor xmm5,xmm5 - sub eax,64 - jmp NEAR L$090cbc_dec_tail_collected -align 16 -L$085cbc_dec_clear_tail_collected: - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 -L$090cbc_dec_tail_collected: - and eax,15 - jnz NEAR L$092cbc_dec_tail_partial - movups [edi],xmm2 - pxor xmm0,xmm0 - jmp NEAR L$081cbc_ret -align 16 -L$092cbc_dec_tail_partial: - movaps [esp],xmm2 - pxor xmm0,xmm0 - mov ecx,16 - mov esi,esp - sub ecx,eax -dd 2767451785 - movdqa [esp],xmm2 -L$081cbc_ret: - mov esp,DWORD [16+esp] - mov ebp,DWORD [36+esp] - pxor xmm2,xmm2 - pxor xmm1,xmm1 - movups [ebp],xmm7 - pxor xmm7,xmm7 -L$076cbc_abort: - pop edi - pop esi - pop ebx - pop ebp - ret -align 16 -__aesni_set_encrypt_key: - push ebp - push ebx - test eax,eax - jz NEAR L$093bad_pointer - test edx,edx - jz NEAR L$093bad_pointer - call L$094pic -L$094pic: - pop ebx - lea ebx,[(L$key_const-L$094pic)+ebx] - lea ebp,[_OPENSSL_ia32cap_P] - movups xmm0,[eax] - xorps xmm4,xmm4 - mov ebp,DWORD [4+ebp] - lea edx,[16+edx] - and ebp,268437504 - cmp ecx,256 - je NEAR L$09514rounds - cmp ecx,192 - je NEAR L$09612rounds - cmp ecx,128 - jne NEAR L$097bad_keybits -align 16 -L$09810rounds: - cmp ebp,268435456 - je NEAR L$09910rounds_alt - mov ecx,9 - movups [edx-16],xmm0 -db 102,15,58,223,200,1 - call L$100key_128_cold -db 102,15,58,223,200,2 - call L$101key_128 -db 102,15,58,223,200,4 - call L$101key_128 -db 102,15,58,223,200,8 - call L$101key_128 -db 102,15,58,223,200,16 - call L$101key_128 -db 102,15,58,223,200,32 - call L$101key_128 -db 102,15,58,223,200,64 - call L$101key_128 -db 102,15,58,223,200,128 - call L$101key_128 -db 102,15,58,223,200,27 - call L$101key_128 -db 102,15,58,223,200,54 - call L$101key_128 - movups [edx],xmm0 - mov DWORD [80+edx],ecx - jmp NEAR L$102good_key -align 16 -L$101key_128: - movups [edx],xmm0 - lea edx,[16+edx] -L$100key_128_cold: - shufps xmm4,xmm0,16 - xorps xmm0,xmm4 - shufps xmm4,xmm0,140 - xorps xmm0,xmm4 - shufps xmm1,xmm1,255 - xorps xmm0,xmm1 - ret -align 16 -L$09910rounds_alt: - movdqa xmm5,[ebx] - mov ecx,8 - movdqa xmm4,[32+ebx] - movdqa xmm2,xmm0 - movdqu [edx-16],xmm0 -L$103loop_key128: -db 102,15,56,0,197 -db 
102,15,56,221,196 - pslld xmm4,1 - lea edx,[16+edx] - movdqa xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm2,xmm3 - pxor xmm0,xmm2 - movdqu [edx-16],xmm0 - movdqa xmm2,xmm0 - dec ecx - jnz NEAR L$103loop_key128 - movdqa xmm4,[48+ebx] -db 102,15,56,0,197 -db 102,15,56,221,196 - pslld xmm4,1 - movdqa xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm2,xmm3 - pxor xmm0,xmm2 - movdqu [edx],xmm0 - movdqa xmm2,xmm0 -db 102,15,56,0,197 -db 102,15,56,221,196 - movdqa xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm3,xmm2 - pslldq xmm2,4 - pxor xmm2,xmm3 - pxor xmm0,xmm2 - movdqu [16+edx],xmm0 - mov ecx,9 - mov DWORD [96+edx],ecx - jmp NEAR L$102good_key -align 16 -L$09612rounds: - movq xmm2,[16+eax] - cmp ebp,268435456 - je NEAR L$10412rounds_alt - mov ecx,11 - movups [edx-16],xmm0 -db 102,15,58,223,202,1 - call L$105key_192a_cold -db 102,15,58,223,202,2 - call L$106key_192b -db 102,15,58,223,202,4 - call L$107key_192a -db 102,15,58,223,202,8 - call L$106key_192b -db 102,15,58,223,202,16 - call L$107key_192a -db 102,15,58,223,202,32 - call L$106key_192b -db 102,15,58,223,202,64 - call L$107key_192a -db 102,15,58,223,202,128 - call L$106key_192b - movups [edx],xmm0 - mov DWORD [48+edx],ecx - jmp NEAR L$102good_key -align 16 -L$107key_192a: - movups [edx],xmm0 - lea edx,[16+edx] -align 16 -L$105key_192a_cold: - movaps xmm5,xmm2 -L$108key_192b_warm: - shufps xmm4,xmm0,16 - movdqa xmm3,xmm2 - xorps xmm0,xmm4 - shufps xmm4,xmm0,140 - pslldq xmm3,4 - xorps xmm0,xmm4 - pshufd xmm1,xmm1,85 - pxor xmm2,xmm3 - pxor xmm0,xmm1 - pshufd xmm3,xmm0,255 - pxor xmm2,xmm3 - ret -align 16 -L$106key_192b: - movaps xmm3,xmm0 - shufps xmm5,xmm0,68 - movups [edx],xmm5 - shufps xmm3,xmm2,78 - movups [16+edx],xmm3 - lea edx,[32+edx] - jmp NEAR L$108key_192b_warm -align 16 -L$10412rounds_alt: - movdqa xmm5,[16+ebx] - movdqa xmm4,[32+ebx] - mov ecx,8 - movdqu [edx-16],xmm0 -L$109loop_key192: - movq [edx],xmm2 - movdqa xmm1,xmm2 -db 102,15,56,0,213 -db 102,15,56,221,212 - pslld xmm4,1 - lea edx,[24+edx] - movdqa xmm3,xmm0 - pslldq xmm0,4 - pxor xmm3,xmm0 - pslldq xmm0,4 - pxor xmm3,xmm0 - pslldq xmm0,4 - pxor xmm0,xmm3 - pshufd xmm3,xmm0,255 - pxor xmm3,xmm1 - pslldq xmm1,4 - pxor xmm3,xmm1 - pxor xmm0,xmm2 - pxor xmm2,xmm3 - movdqu [edx-16],xmm0 - dec ecx - jnz NEAR L$109loop_key192 - mov ecx,11 - mov DWORD [32+edx],ecx - jmp NEAR L$102good_key -align 16 -L$09514rounds: - movups xmm2,[16+eax] - lea edx,[16+edx] - cmp ebp,268435456 - je NEAR L$11014rounds_alt - mov ecx,13 - movups [edx-32],xmm0 - movups [edx-16],xmm2 -db 102,15,58,223,202,1 - call L$111key_256a_cold -db 102,15,58,223,200,1 - call L$112key_256b -db 102,15,58,223,202,2 - call L$113key_256a -db 102,15,58,223,200,2 - call L$112key_256b -db 102,15,58,223,202,4 - call L$113key_256a -db 102,15,58,223,200,4 - call L$112key_256b -db 102,15,58,223,202,8 - call L$113key_256a -db 102,15,58,223,200,8 - call L$112key_256b -db 102,15,58,223,202,16 - call L$113key_256a -db 102,15,58,223,200,16 - call L$112key_256b -db 102,15,58,223,202,32 - call L$113key_256a -db 102,15,58,223,200,32 - call L$112key_256b -db 102,15,58,223,202,64 - call L$113key_256a - movups [edx],xmm0 - mov DWORD [16+edx],ecx - xor eax,eax - jmp NEAR L$102good_key -align 16 -L$113key_256a: - movups [edx],xmm2 - lea edx,[16+edx] -L$111key_256a_cold: - shufps xmm4,xmm0,16 - xorps xmm0,xmm4 - shufps xmm4,xmm0,140 - xorps xmm0,xmm4 - shufps xmm1,xmm1,255 - xorps xmm0,xmm1 - ret 
-align 16 -L$112key_256b: - movups [edx],xmm0 - lea edx,[16+edx] - shufps xmm4,xmm2,16 - xorps xmm2,xmm4 - shufps xmm4,xmm2,140 - xorps xmm2,xmm4 - shufps xmm1,xmm1,170 - xorps xmm2,xmm1 - ret -align 16 -L$11014rounds_alt: - movdqa xmm5,[ebx] - movdqa xmm4,[32+ebx] - mov ecx,7 - movdqu [edx-32],xmm0 - movdqa xmm1,xmm2 - movdqu [edx-16],xmm2 -L$114loop_key256: -db 102,15,56,0,213 -db 102,15,56,221,212 - movdqa xmm3,xmm0 - pslldq xmm0,4 - pxor xmm3,xmm0 - pslldq xmm0,4 - pxor xmm3,xmm0 - pslldq xmm0,4 - pxor xmm0,xmm3 - pslld xmm4,1 - pxor xmm0,xmm2 - movdqu [edx],xmm0 - dec ecx - jz NEAR L$115done_key256 - pshufd xmm2,xmm0,255 - pxor xmm3,xmm3 -db 102,15,56,221,211 - movdqa xmm3,xmm1 - pslldq xmm1,4 - pxor xmm3,xmm1 - pslldq xmm1,4 - pxor xmm3,xmm1 - pslldq xmm1,4 - pxor xmm1,xmm3 - pxor xmm2,xmm1 - movdqu [16+edx],xmm2 - lea edx,[32+edx] - movdqa xmm1,xmm2 - jmp NEAR L$114loop_key256 -L$115done_key256: - mov ecx,13 - mov DWORD [16+edx],ecx -L$102good_key: - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - xor eax,eax - pop ebx - pop ebp - ret -align 4 -L$093bad_pointer: - mov eax,-1 - pop ebx - pop ebp - ret -align 4 -L$097bad_keybits: - pxor xmm0,xmm0 - mov eax,-2 - pop ebx - pop ebp - ret -global _aes_hw_set_encrypt_key -align 16 -_aes_hw_set_encrypt_key: -L$_aes_hw_set_encrypt_key_begin: -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$116pic -L$116pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+3-L$116pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - mov eax,DWORD [4+esp] - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - call __aesni_set_encrypt_key - ret -global _aes_hw_set_decrypt_key -align 16 -_aes_hw_set_decrypt_key: -L$_aes_hw_set_decrypt_key_begin: - mov eax,DWORD [4+esp] - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - call __aesni_set_encrypt_key - mov edx,DWORD [12+esp] - shl ecx,4 - test eax,eax - jnz NEAR L$117dec_key_ret - lea eax,[16+ecx*1+edx] - movups xmm0,[edx] - movups xmm1,[eax] - movups [eax],xmm0 - movups [edx],xmm1 - lea edx,[16+edx] - lea eax,[eax-16] -L$118dec_key_inverse: - movups xmm0,[edx] - movups xmm1,[eax] -db 102,15,56,219,192 -db 102,15,56,219,201 - lea edx,[16+edx] - lea eax,[eax-16] - movups [16+eax],xmm0 - movups [edx-16],xmm1 - cmp eax,edx - ja NEAR L$118dec_key_inverse - movups xmm0,[edx] -db 102,15,56,219,192 - movups [edx],xmm0 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - xor eax,eax -L$117dec_key_ret: - ret -align 64 -L$key_const: -dd 202313229,202313229,202313229,202313229 -dd 67569157,67569157,67569157,67569157 -dd 1,1,1,1 -dd 27,27,27,27 -db 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 -db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 -db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 -db 115,108,46,111,114,103,62,0 -segment .bss -common _OPENSSL_ia32cap_P 16 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-ios.ios.arm.S deleted file mode 100644 index 9d4c1984..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-ios.ios.arm.S +++ /dev/null @@ -1,808 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the 
BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -#if __ARM_MAX_ARCH__>=7 -.text - - -.code 32 -#undef __thumb2__ -.align 5 -Lrcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.text - -.globl _aes_hw_set_encrypt_key -.private_extern _aes_hw_set_encrypt_key -#ifdef __thumb2__ -.thumb_func _aes_hw_set_encrypt_key -#endif -.align 5 -_aes_hw_set_encrypt_key: -Lenc_key: - mov r3,#-1 - cmp r0,#0 - beq Lenc_key_abort - cmp r2,#0 - beq Lenc_key_abort - mov r3,#-2 - cmp r1,#128 - blt Lenc_key_abort - cmp r1,#256 - bgt Lenc_key_abort - tst r1,#0x3f - bne Lenc_key_abort - - adr r3,Lrcon - cmp r1,#192 - - veor q0,q0,q0 - vld1.8 {q3},[r0]! - mov r1,#8 @ reuse r1 - vld1.32 {q1,q2},[r3]! - - blt Loop128 - beq L192 - b L256 - -.align 4 -Loop128: - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - bne Loop128 - - vld1.32 {q1},[r3] - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - veor q3,q3,q10 - vst1.32 {q3},[r2] - add r2,r2,#0x50 - - mov r12,#10 - b Ldone - -.align 4 -L192: - vld1.8 {d16},[r0]! - vmov.i8 q10,#8 @ borrow q10 - vst1.32 {q3},[r2]! - vsub.i8 q2,q2,q10 @ adjust the mask - -Loop192: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {d16},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - - vdup.32 q9,d7[1] - veor q9,q9,q8 - veor q10,q10,q1 - vext.8 q8,q0,q8,#12 - vshl.u8 q1,q1,#1 - veor q8,q8,q9 - veor q3,q3,q10 - veor q8,q8,q10 - vst1.32 {q3},[r2]! - bne Loop192 - - mov r12,#12 - add r2,r2,#0x20 - b Ldone - -.align 4 -L256: - vld1.8 {q8},[r0] - mov r1,#7 - mov r12,#14 - vst1.32 {q3},[r2]! - -Loop256: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q8},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - vst1.32 {q3},[r2]! 
- beq Ldone - - vdup.32 q10,d7[1] - vext.8 q9,q0,q8,#12 -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - - veor q8,q8,q10 - b Loop256 - -Ldone: - str r12,[r2] - mov r3,#0 - -Lenc_key_abort: - mov r0,r3 @ return value - - bx lr - - -.globl _aes_hw_set_decrypt_key -.private_extern _aes_hw_set_decrypt_key -#ifdef __thumb2__ -.thumb_func _aes_hw_set_decrypt_key -#endif -.align 5 -_aes_hw_set_decrypt_key: - stmdb sp!,{r4,lr} - bl Lenc_key - - cmp r0,#0 - bne Ldec_key_abort - - sub r2,r2,#240 @ restore original r2 - mov r4,#-16 - add r0,r2,r12,lsl#4 @ end of key schedule - - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - -Loop_imc: - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - cmp r0,r2 - bhi Loop_imc - - vld1.32 {q0},[r2] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - vst1.32 {q0},[r0] - - eor r0,r0,r0 @ return value -Ldec_key_abort: - ldmia sp!,{r4,pc} - -.globl _aes_hw_encrypt -.private_extern _aes_hw_encrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_encrypt -#endif -.align 5 -_aes_hw_encrypt: - AARCH64_VALID_CALL_TARGET - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -Loop_enc: -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q0},[r2]! - subs r3,r3,#2 -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q1},[r2]! - bgt Loop_enc - -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q0},[r2] -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr - -.globl _aes_hw_decrypt -.private_extern _aes_hw_decrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_decrypt -#endif -.align 5 -_aes_hw_decrypt: - AARCH64_VALID_CALL_TARGET - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -Loop_dec: -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q0},[r2]! - subs r3,r3,#2 -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q1},[r2]! - bgt Loop_dec - -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q0},[r2] -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr - -.globl _aes_hw_cbc_encrypt -.private_extern _aes_hw_cbc_encrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_cbc_encrypt -#endif -.align 5 -_aes_hw_cbc_encrypt: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,lr} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldmia ip,{r4,r5} @ load remaining args - subs r2,r2,#16 - mov r8,#16 - blo Lcbc_abort - moveq r8,#0 - - cmp r5,#0 @ en- or decrypting? - ldr r5,[r3,#240] - and r2,r2,#-16 - vld1.8 {q6},[r4] - vld1.8 {q0},[r0],r8 - - vld1.32 {q8,q9},[r3] @ load key schedule... - sub r5,r5,#6 - add r7,r3,r5,lsl#4 @ pointer to last 7 round keys - sub r5,r5,#2 - vld1.32 {q10,q11},[r7]! - vld1.32 {q12,q13},[r7]! - vld1.32 {q14,q15},[r7]! 
- vld1.32 {q7},[r7] - - add r7,r3,#32 - mov r6,r5 - beq Lcbc_dec - - cmp r5,#2 - veor q0,q0,q6 - veor q5,q8,q7 - beq Lcbc_enc128 - - vld1.32 {q2,q3},[r7] - add r7,r3,#16 - add r6,r3,#16*4 - add r12,r3,#16*5 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - add r14,r3,#16*6 - add r3,r3,#16*7 - b Lenter_cbc_enc - -.align 4 -Loop_cbc_enc: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! -Lenter_cbc_enc: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q8},[r6] - cmp r5,#4 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r12] - beq Lcbc_enc192 - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q8},[r14] -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r3] - nop - -Lcbc_enc192: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r7] @ re-pre-load rndkey[1] -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs Loop_cbc_enc - - vst1.8 {q6},[r1]! - b Lcbc_done - -.align 5 -Lcbc_enc128: - vld1.32 {q2,q3},[r7] -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - b Lenter_cbc_enc128 -Loop_cbc_enc128: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! -Lenter_cbc_enc128: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs Loop_cbc_enc128 - - vst1.8 {q6},[r1]! - b Lcbc_done -.align 5 -Lcbc_dec: - vld1.8 {q10},[r0]! - subs r2,r2,#32 @ bias - add r6,r5,#2 - vorr q3,q0,q0 - vorr q1,q0,q0 - vorr q11,q10,q10 - blo Lcbc_dec_tail - - vorr q1,q10,q10 - vld1.8 {q10},[r0]! - vorr q2,q0,q0 - vorr q3,q1,q1 - vorr q11,q10,q10 - -Loop3x_cbc_dec: -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! 
- subs r6,r6,#2 -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q9},[r7]! - bgt Loop3x_cbc_dec - -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q4,q6,q7 - subs r2,r2,#0x30 - veor q5,q2,q7 - movlo r6,r2 @ r6, r6, is zero at this point -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q9,q3,q7 - add r0,r0,r6 @ r0 is adjusted in such way that - @ at exit from the loop q1-q10 - @ are loaded with last "words" - vorr q6,q11,q11 - mov r7,r3 -.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q2},[r0]! -.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q3},[r0]! -.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q11},[r0]! -.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - add r6,r5,#2 - veor q4,q4,q0 - veor q5,q5,q1 - veor q10,q10,q9 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q4},[r1]! - vorr q0,q2,q2 - vst1.8 {q5},[r1]! - vorr q1,q3,q3 - vst1.8 {q10},[r1]! - vorr q10,q11,q11 - bhs Loop3x_cbc_dec - - cmn r2,#0x30 - beq Lcbc_done - nop - -Lcbc_dec_tail: -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q9},[r7]! 
- bgt Lcbc_dec_tail - -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - cmn r2,#0x20 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q5,q6,q7 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q9,q3,q7 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - beq Lcbc_dec_one - veor q5,q5,q1 - veor q9,q9,q10 - vorr q6,q11,q11 - vst1.8 {q5},[r1]! - vst1.8 {q9},[r1]! - b Lcbc_done - -Lcbc_dec_one: - veor q5,q5,q10 - vorr q6,q11,q11 - vst1.8 {q5},[r1]! - -Lcbc_done: - vst1.8 {q6},[r4] -Lcbc_abort: - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,pc} - -.globl _aes_hw_ctr32_encrypt_blocks -.private_extern _aes_hw_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _aes_hw_ctr32_encrypt_blocks -#endif -.align 5 -_aes_hw_ctr32_encrypt_blocks: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldr r4, [ip] @ load remaining arg - ldr r5,[r3,#240] - - ldr r8, [r4, #12] - vld1.32 {q0},[r4] - - vld1.32 {q8,q9},[r3] @ load key schedule... - sub r5,r5,#4 - mov r12,#16 - cmp r2,#2 - add r7,r3,r5,lsl#4 @ pointer to last 5 round keys - sub r5,r5,#2 - vld1.32 {q12,q13},[r7]! - vld1.32 {q14,q15},[r7]! - vld1.32 {q7},[r7] - add r7,r3,#32 - mov r6,r5 - movlo r12,#0 - - @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are - @ affected by silicon errata #1742098 [0] and #1655431 [1], - @ respectively, where the second instruction of an aese/aesmc - @ instruction pair may execute twice if an interrupt is taken right - @ after the first instruction consumes an input register of which a - @ single 32-bit lane has been updated the last time it was modified. - @ - @ This function uses a counter in one 32-bit lane. The - @ could write to q1 and q10 directly, but that trips this bugs. - @ We write to q6 and copy to the final register as a workaround. - @ - @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice - @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice -#ifndef __ARMEB__ - rev r8, r8 -#endif - add r10, r8, #1 - vorr q6,q0,q0 - rev r10, r10 - vmov.32 d13[1],r10 - add r8, r8, #2 - vorr q1,q6,q6 - bls Lctr32_tail - rev r12, r8 - vmov.32 d13[1],r12 - sub r2,r2,#3 @ bias - vorr q10,q6,q6 - b Loop3x_ctr32 - -.align 4 -Loop3x_ctr32: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.32 {q8},[r7]! 
- subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.32 {q9},[r7]! - bgt Loop3x_ctr32 - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 - vld1.8 {q2},[r0]! - add r9,r8,#1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.8 {q3},[r0]! - rev r9,r9 -.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vld1.8 {q11},[r0]! - mov r7,r3 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 -.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - veor q2,q2,q7 - add r10,r8,#2 -.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - veor q3,q3,q7 - add r8,r8,#3 -.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - @ Note the logic to update q0, q1, and q1 is written to work - @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in - @ 32-bit mode. See the comment above. - veor q11,q11,q7 - vmov.32 d13[1], r9 -.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - vorr q0,q6,q6 - rev r10,r10 -.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vmov.32 d13[1], r10 - rev r12,r8 -.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vorr q1,q6,q6 - vmov.32 d13[1], r12 -.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - vorr q10,q6,q6 - subs r2,r2,#3 -.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 -.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 -.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 - - veor q2,q2,q4 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - vst1.8 {q2},[r1]! - veor q3,q3,q5 - mov r6,r5 - vst1.8 {q3},[r1]! - veor q11,q11,q9 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q11},[r1]! - bhs Loop3x_ctr32 - - adds r2,r2,#3 - beq Lctr32_done - cmp r2,#1 - mov r12,#16 - moveq r12,#0 - -Lctr32_tail: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.32 {q9},[r7]! 
- bgt Lctr32_tail - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q2},[r0],r12 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q3},[r0] -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q2,q2,q7 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q3,q3,q7 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 -.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 - - cmp r2,#1 - veor q2,q2,q0 - veor q3,q3,q1 - vst1.8 {q2},[r1]! - beq Lctr32_done - vst1.8 {q3},[r1] - -Lctr32_done: - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} - -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-ios.ios.arm.S deleted file mode 100644 index be0d2bb9..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-ios.ios.arm.S +++ /dev/null @@ -1,950 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
- - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl _bn_mul_mont_nohw -.private_extern _bn_mul_mont_nohw -#ifdef __thumb2__ -.thumb_func _bn_mul_mont_nohw -#endif - -.align 5 -_bn_mul_mont_nohw: - ldr ip,[sp,#4] @ load num - stmdb sp!,{r0,r2} @ sp points at argument block - cmp ip,#2 - mov r0,ip @ load num -#ifdef __thumb2__ - ittt lt -#endif - movlt r0,#0 - addlt sp,sp,#2*4 - blt Labrt - - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers - - mov r0,r0,lsl#2 @ rescale r0 for byte count - sub sp,sp,r0 @ alloca(4*num) - sub sp,sp,#4 @ +extra dword - sub r0,r0,#4 @ "num=num-1" - add r4,r2,r0 @ &bp[num-1] - - add r0,sp,r0 @ r0 to point at &tp[num-1] - ldr r8,[r0,#14*4] @ &n0 - ldr r2,[r2] @ bp[0] - ldr r5,[r1],#4 @ ap[0],ap++ - ldr r6,[r3],#4 @ np[0],np++ - ldr r8,[r8] @ *n0 - str r4,[r0,#15*4] @ save &bp[num] - - umull r10,r11,r5,r2 @ ap[0]*bp[0] - str r8,[r0,#14*4] @ save n0 value - mul r8,r10,r8 @ "tp[0]"*n0 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" - mov r4,sp - -L1st: - ldr r5,[r1],#4 @ ap[j],ap++ - mov r10,r11 - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[0] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne L1st - - adds r12,r12,r11 - ldr r4,[r0,#13*4] @ restore bp - mov r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - mov r7,sp - str r14,[r0,#4] @ tp[num]= - -Louter: - sub r7,r0,r7 @ "original" r0-1 value - sub r1,r1,r7 @ "rewind" ap to &ap[1] - ldr r2,[r4,#4]! @ *(++bp) - sub r3,r3,r7 @ "rewind" np to &np[1] - ldr r5,[r1,#-4] @ ap[0] - ldr r10,[sp] @ tp[0] - ldr r6,[r3,#-4] @ np[0] - ldr r7,[sp,#4] @ tp[1] - - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] - str r4,[r0,#13*4] @ save bp - mul r8,r10,r8 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" - mov r4,sp - -Linner: - ldr r5,[r1],#4 @ ap[j],ap++ - adds r10,r11,r7 @ +=tp[j] - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[i] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adc r11,r11,#0 - ldr r7,[r4,#8] @ tp[j+1] - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne Linner - - adds r12,r12,r11 - mov r14,#0 - ldr r4,[r0,#13*4] @ restore bp - adc r14,r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adds r12,r12,r7 - ldr r7,[r0,#15*4] @ restore &bp[num] - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - - cmp r4,r7 -#ifdef __thumb2__ - itt ne -#endif - movne r7,sp - bne Louter - - ldr r2,[r0,#12*4] @ pull rp - mov r5,sp - add r0,r0,#4 @ r0 to point at &tp[num] - sub r5,r0,r5 @ "original" num value - mov r4,sp @ "rewind" r4 - mov r1,r4 @ "borrow" r1 - sub r3,r3,r5 @ "rewind" r3 to &np[0] - - subs r7,r7,r7 @ "clear" carry flag -Lsub: ldr r7,[r4],#4 - ldr r6,[r3],#4 - sbcs r7,r7,r6 @ tp[j]-np[j] - str r7,[r2],#4 @ rp[j]= - teq r4,r0 @ preserve carry - bne Lsub - sbcs r14,r14,#0 @ upmost carry - mov r4,sp @ "rewind" r4 - sub r2,r2,r5 @ "rewind" r2 - -Lcopy: ldr r7,[r4] @ conditional copy - ldr r5,[r2] - str sp,[r4],#4 @ zap tp -#ifdef __thumb2__ - it cc -#endif - movcc r5,r7 - str r5,[r2],#4 - teq r4,r0 @ preserve carry - bne Lcopy - - mov sp,r0 - add sp,sp,#4 @ skip over tp[num+1] - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -Labrt: -#if __ARM_ARCH>=5 - bx lr @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ 
interoperable with Thumb ISA:-) -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.globl _bn_mul8x_mont_neon -.private_extern _bn_mul8x_mont_neon -#ifdef __thumb2__ -.thumb_func _bn_mul8x_mont_neon -#endif -.align 5 -_bn_mul8x_mont_neon: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldmia ip,{r4,r5} @ load rest of parameter block - mov ip,sp - - cmp r5,#8 - bhi LNEON_8n - - @ special case for r5==8, everything is in register bank... - - vld1.32 {d28[0]}, [r2,:32]! - veor d8,d8,d8 - sub r7,sp,r5,lsl#4 - vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( - and r7,r7,#-64 - vld1.32 {d30[0]}, [r4,:32] - mov sp,r7 @ alloca - vzip.16 d28,d8 - - vmull.u32 q6,d28,d0[0] - vmull.u32 q7,d28,d0[1] - vmull.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmull.u32 q9,d28,d1[1] - - vadd.u64 d29,d29,d12 - veor d8,d8,d8 - vmul.u32 d29,d29,d30 - - vmull.u32 q10,d28,d2[0] - vld1.32 {d4,d5,d6,d7}, [r3]! - vmull.u32 q11,d28,d2[1] - vmull.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmull.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - sub r9,r5,#1 - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - b LNEON_outer8 - -.align 4 -LNEON_outer8: - vld1.32 {d28[0]}, [r2,:32]! - veor d8,d8,d8 - vzip.16 d28,d8 - vadd.u64 d12,d12,d10 - - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - vmlal.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmlal.u32 q9,d28,d1[1] - - vadd.u64 d29,d29,d12 - veor d8,d8,d8 - subs r9,r9,#1 - vmul.u32 d29,d29,d30 - - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - bne LNEON_outer8 - - vadd.u64 d12,d12,d10 - mov r7,sp - vshr.u64 d10,d12,#16 - mov r8,r5 - vadd.u64 d13,d13,d10 - add r6,sp,#96 - vshr.u64 d10,d13,#16 - vzip.16 d12,d13 - - b LNEON_tail_entry - -.align 4 -LNEON_8n: - veor q6,q6,q6 - sub r7,sp,#128 - veor q7,q7,q7 - sub r7,r7,r5,lsl#4 - veor q8,q8,q8 - and r7,r7,#-64 - veor q9,q9,q9 - mov sp,r7 @ alloca - veor q10,q10,q10 - add r7,r7,#256 - veor q11,q11,q11 - sub r8,r5,#8 - veor q12,q12,q12 - veor q13,q13,q13 - -LNEON_8n_init: - vst1.64 {q6,q7},[r7,:256]! - subs r8,r8,#8 - vst1.64 {q8,q9},[r7,:256]! - vst1.64 {q10,q11},[r7,:256]! - vst1.64 {q12,q13},[r7,:256]! - bne LNEON_8n_init - - add r6,sp,#256 - vld1.32 {d0,d1,d2,d3},[r1]! - add r10,sp,#8 - vld1.32 {d30[0]},[r4,:32] - mov r9,r5 - b LNEON_8n_outer - -.align 4 -LNEON_8n_outer: - vld1.32 {d28[0]},[r2,:32]! @ *b++ - veor d8,d8,d8 - vzip.16 d28,d8 - add r7,sp,#128 - vld1.32 {d4,d5,d6,d7},[r3]! 
- - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmlal.u32 q9,d28,d1[1] - vadd.u64 d29,d29,d12 - vmlal.u32 q10,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q11,d28,d2[1] - vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q6,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q7,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q8,d29,d5[0] - vshr.u64 d12,d12,#16 - vmlal.u32 q9,d29,d5[1] - vmlal.u32 q10,d29,d6[0] - vadd.u64 d12,d12,d13 - vmlal.u32 q11,d29,d6[1] - vshr.u64 d12,d12,#16 - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vadd.u64 d14,d14,d12 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] - vmlal.u32 q7,d28,d0[0] - vld1.64 {q6},[r6,:128]! - vmlal.u32 q8,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q9,d28,d1[0] - vshl.i64 d29,d15,#16 - vmlal.u32 q10,d28,d1[1] - vadd.u64 d29,d29,d14 - vmlal.u32 q11,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q12,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] - vmlal.u32 q13,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q6,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q7,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q8,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q9,d29,d5[0] - vshr.u64 d14,d14,#16 - vmlal.u32 q10,d29,d5[1] - vmlal.u32 q11,d29,d6[0] - vadd.u64 d14,d14,d15 - vmlal.u32 q12,d29,d6[1] - vshr.u64 d14,d14,#16 - vmlal.u32 q13,d29,d7[0] - vmlal.u32 q6,d29,d7[1] - vadd.u64 d16,d16,d14 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] - vmlal.u32 q8,d28,d0[0] - vld1.64 {q7},[r6,:128]! - vmlal.u32 q9,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q10,d28,d1[0] - vshl.i64 d29,d17,#16 - vmlal.u32 q11,d28,d1[1] - vadd.u64 d29,d29,d16 - vmlal.u32 q12,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q13,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2] - vmlal.u32 q6,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q7,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q8,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q9,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q10,d29,d5[0] - vshr.u64 d16,d16,#16 - vmlal.u32 q11,d29,d5[1] - vmlal.u32 q12,d29,d6[0] - vadd.u64 d16,d16,d17 - vmlal.u32 q13,d29,d6[1] - vshr.u64 d16,d16,#16 - vmlal.u32 q6,d29,d7[0] - vmlal.u32 q7,d29,d7[1] - vadd.u64 d18,d18,d16 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] - vmlal.u32 q9,d28,d0[0] - vld1.64 {q8},[r6,:128]! - vmlal.u32 q10,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q11,d28,d1[0] - vshl.i64 d29,d19,#16 - vmlal.u32 q12,d28,d1[1] - vadd.u64 d29,d29,d18 - vmlal.u32 q13,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q6,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] - vmlal.u32 q7,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q8,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q9,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q10,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q11,d29,d5[0] - vshr.u64 d18,d18,#16 - vmlal.u32 q12,d29,d5[1] - vmlal.u32 q13,d29,d6[0] - vadd.u64 d18,d18,d19 - vmlal.u32 q6,d29,d6[1] - vshr.u64 d18,d18,#16 - vmlal.u32 q7,d29,d7[0] - vmlal.u32 q8,d29,d7[1] - vadd.u64 d20,d20,d18 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] - vmlal.u32 q10,d28,d0[0] - vld1.64 {q9},[r6,:128]! - vmlal.u32 q11,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q12,d28,d1[0] - vshl.i64 d29,d21,#16 - vmlal.u32 q13,d28,d1[1] - vadd.u64 d29,d29,d20 - vmlal.u32 q6,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q7,d28,d2[1] - vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+4] - vmlal.u32 q8,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q9,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q10,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q11,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q12,d29,d5[0] - vshr.u64 d20,d20,#16 - vmlal.u32 q13,d29,d5[1] - vmlal.u32 q6,d29,d6[0] - vadd.u64 d20,d20,d21 - vmlal.u32 q7,d29,d6[1] - vshr.u64 d20,d20,#16 - vmlal.u32 q8,d29,d7[0] - vmlal.u32 q9,d29,d7[1] - vadd.u64 d22,d22,d20 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] - vmlal.u32 q11,d28,d0[0] - vld1.64 {q10},[r6,:128]! - vmlal.u32 q12,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q13,d28,d1[0] - vshl.i64 d29,d23,#16 - vmlal.u32 q6,d28,d1[1] - vadd.u64 d29,d29,d22 - vmlal.u32 q7,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q8,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] - vmlal.u32 q9,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q10,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q11,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q12,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q13,d29,d5[0] - vshr.u64 d22,d22,#16 - vmlal.u32 q6,d29,d5[1] - vmlal.u32 q7,d29,d6[0] - vadd.u64 d22,d22,d23 - vmlal.u32 q8,d29,d6[1] - vshr.u64 d22,d22,#16 - vmlal.u32 q9,d29,d7[0] - vmlal.u32 q10,d29,d7[1] - vadd.u64 d24,d24,d22 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] - vmlal.u32 q12,d28,d0[0] - vld1.64 {q11},[r6,:128]! - vmlal.u32 q13,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q6,d28,d1[0] - vshl.i64 d29,d25,#16 - vmlal.u32 q7,d28,d1[1] - vadd.u64 d29,d29,d24 - vmlal.u32 q8,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q9,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] - vmlal.u32 q10,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q11,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q12,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q13,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q6,d29,d5[0] - vshr.u64 d24,d24,#16 - vmlal.u32 q7,d29,d5[1] - vmlal.u32 q8,d29,d6[0] - vadd.u64 d24,d24,d25 - vmlal.u32 q9,d29,d6[1] - vshr.u64 d24,d24,#16 - vmlal.u32 q10,d29,d7[0] - vmlal.u32 q11,d29,d7[1] - vadd.u64 d26,d26,d24 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] - vmlal.u32 q13,d28,d0[0] - vld1.64 {q12},[r6,:128]! - vmlal.u32 q6,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q7,d28,d1[0] - vshl.i64 d29,d27,#16 - vmlal.u32 q8,d28,d1[1] - vadd.u64 d29,d29,d26 - vmlal.u32 q9,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q10,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] - vmlal.u32 q11,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q12,d28,d3[1] - vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] - vmlal.u32 q13,d29,d4[0] - vld1.32 {d0,d1,d2,d3},[r1]! - vmlal.u32 q6,d29,d4[1] - vmlal.u32 q7,d29,d5[0] - vshr.u64 d26,d26,#16 - vmlal.u32 q8,d29,d5[1] - vmlal.u32 q9,d29,d6[0] - vadd.u64 d26,d26,d27 - vmlal.u32 q10,d29,d6[1] - vshr.u64 d26,d26,#16 - vmlal.u32 q11,d29,d7[0] - vmlal.u32 q12,d29,d7[1] - vadd.u64 d12,d12,d26 - vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] - add r10,sp,#8 @ rewind - sub r8,r5,#8 - b LNEON_8n_inner - -.align 4 -LNEON_8n_inner: - subs r8,r8,#8 - vmlal.u32 q6,d28,d0[0] - vld1.64 {q13},[r6,:128] - vmlal.u32 q7,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] - vmlal.u32 q8,d28,d1[0] - vld1.32 {d4,d5,d6,d7},[r3]! - vmlal.u32 q9,d28,d1[1] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vmlal.u32 q13,d28,d3[1] - vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+1] - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - vmlal.u32 q10,d29,d6[0] - vmlal.u32 q11,d29,d6[1] - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vst1.64 {q6},[r7,:128]! - vmlal.u32 q7,d28,d0[0] - vld1.64 {q6},[r6,:128] - vmlal.u32 q8,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] - vmlal.u32 q9,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q10,d28,d1[1] - vmlal.u32 q11,d28,d2[0] - vmlal.u32 q12,d28,d2[1] - vmlal.u32 q13,d28,d3[0] - vmlal.u32 q6,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] - vmlal.u32 q7,d29,d4[0] - vmlal.u32 q8,d29,d4[1] - vmlal.u32 q9,d29,d5[0] - vmlal.u32 q10,d29,d5[1] - vmlal.u32 q11,d29,d6[0] - vmlal.u32 q12,d29,d6[1] - vmlal.u32 q13,d29,d7[0] - vmlal.u32 q6,d29,d7[1] - vst1.64 {q7},[r7,:128]! - vmlal.u32 q8,d28,d0[0] - vld1.64 {q7},[r6,:128] - vmlal.u32 q9,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] - vmlal.u32 q10,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q11,d28,d1[1] - vmlal.u32 q12,d28,d2[0] - vmlal.u32 q13,d28,d2[1] - vmlal.u32 q6,d28,d3[0] - vmlal.u32 q7,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] - vmlal.u32 q8,d29,d4[0] - vmlal.u32 q9,d29,d4[1] - vmlal.u32 q10,d29,d5[0] - vmlal.u32 q11,d29,d5[1] - vmlal.u32 q12,d29,d6[0] - vmlal.u32 q13,d29,d6[1] - vmlal.u32 q6,d29,d7[0] - vmlal.u32 q7,d29,d7[1] - vst1.64 {q8},[r7,:128]! - vmlal.u32 q9,d28,d0[0] - vld1.64 {q8},[r6,:128] - vmlal.u32 q10,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] - vmlal.u32 q11,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q12,d28,d1[1] - vmlal.u32 q13,d28,d2[0] - vmlal.u32 q6,d28,d2[1] - vmlal.u32 q7,d28,d3[0] - vmlal.u32 q8,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] - vmlal.u32 q9,d29,d4[0] - vmlal.u32 q10,d29,d4[1] - vmlal.u32 q11,d29,d5[0] - vmlal.u32 q12,d29,d5[1] - vmlal.u32 q13,d29,d6[0] - vmlal.u32 q6,d29,d6[1] - vmlal.u32 q7,d29,d7[0] - vmlal.u32 q8,d29,d7[1] - vst1.64 {q9},[r7,:128]! - vmlal.u32 q10,d28,d0[0] - vld1.64 {q9},[r6,:128] - vmlal.u32 q11,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] - vmlal.u32 q12,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q13,d28,d1[1] - vmlal.u32 q6,d28,d2[0] - vmlal.u32 q7,d28,d2[1] - vmlal.u32 q8,d28,d3[0] - vmlal.u32 q9,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] - vmlal.u32 q10,d29,d4[0] - vmlal.u32 q11,d29,d4[1] - vmlal.u32 q12,d29,d5[0] - vmlal.u32 q13,d29,d5[1] - vmlal.u32 q6,d29,d6[0] - vmlal.u32 q7,d29,d6[1] - vmlal.u32 q8,d29,d7[0] - vmlal.u32 q9,d29,d7[1] - vst1.64 {q10},[r7,:128]! - vmlal.u32 q11,d28,d0[0] - vld1.64 {q10},[r6,:128] - vmlal.u32 q12,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] - vmlal.u32 q13,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q6,d28,d1[1] - vmlal.u32 q7,d28,d2[0] - vmlal.u32 q8,d28,d2[1] - vmlal.u32 q9,d28,d3[0] - vmlal.u32 q10,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] - vmlal.u32 q11,d29,d4[0] - vmlal.u32 q12,d29,d4[1] - vmlal.u32 q13,d29,d5[0] - vmlal.u32 q6,d29,d5[1] - vmlal.u32 q7,d29,d6[0] - vmlal.u32 q8,d29,d6[1] - vmlal.u32 q9,d29,d7[0] - vmlal.u32 q10,d29,d7[1] - vst1.64 {q11},[r7,:128]! - vmlal.u32 q12,d28,d0[0] - vld1.64 {q11},[r6,:128] - vmlal.u32 q13,d28,d0[1] - vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+6] - vmlal.u32 q6,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q7,d28,d1[1] - vmlal.u32 q8,d28,d2[0] - vmlal.u32 q9,d28,d2[1] - vmlal.u32 q10,d28,d3[0] - vmlal.u32 q11,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] - vmlal.u32 q12,d29,d4[0] - vmlal.u32 q13,d29,d4[1] - vmlal.u32 q6,d29,d5[0] - vmlal.u32 q7,d29,d5[1] - vmlal.u32 q8,d29,d6[0] - vmlal.u32 q9,d29,d6[1] - vmlal.u32 q10,d29,d7[0] - vmlal.u32 q11,d29,d7[1] - vst1.64 {q12},[r7,:128]! - vmlal.u32 q13,d28,d0[0] - vld1.64 {q12},[r6,:128] - vmlal.u32 q6,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] - vmlal.u32 q7,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q8,d28,d1[1] - vmlal.u32 q9,d28,d2[0] - vmlal.u32 q10,d28,d2[1] - vmlal.u32 q11,d28,d3[0] - vmlal.u32 q12,d28,d3[1] - it eq - subeq r1,r1,r5,lsl#2 @ rewind - vmlal.u32 q13,d29,d4[0] - vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] - vmlal.u32 q6,d29,d4[1] - vld1.32 {d0,d1,d2,d3},[r1]! - vmlal.u32 q7,d29,d5[0] - add r10,sp,#8 @ rewind - vmlal.u32 q8,d29,d5[1] - vmlal.u32 q9,d29,d6[0] - vmlal.u32 q10,d29,d6[1] - vmlal.u32 q11,d29,d7[0] - vst1.64 {q13},[r7,:128]! - vmlal.u32 q12,d29,d7[1] - - bne LNEON_8n_inner - add r6,sp,#128 - vst1.64 {q6,q7},[r7,:256]! - veor q2,q2,q2 @ d4-d5 - vst1.64 {q8,q9},[r7,:256]! - veor q3,q3,q3 @ d6-d7 - vst1.64 {q10,q11},[r7,:256]! - vst1.64 {q12},[r7,:128] - - subs r9,r9,#8 - vld1.64 {q6,q7},[r6,:256]! - vld1.64 {q8,q9},[r6,:256]! - vld1.64 {q10,q11},[r6,:256]! - vld1.64 {q12,q13},[r6,:256]! - - itt ne - subne r3,r3,r5,lsl#2 @ rewind - bne LNEON_8n_outer - - add r7,sp,#128 - vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame - vshr.u64 d10,d12,#16 - vst1.64 {q2,q3},[sp,:256]! - vadd.u64 d13,d13,d10 - vst1.64 {q2,q3}, [sp,:256]! - vshr.u64 d10,d13,#16 - vst1.64 {q2,q3}, [sp,:256]! - vzip.16 d12,d13 - - mov r8,r5 - b LNEON_tail_entry - -.align 4 -LNEON_tail: - vadd.u64 d12,d12,d10 - vshr.u64 d10,d12,#16 - vld1.64 {q8,q9}, [r6, :256]! - vadd.u64 d13,d13,d10 - vld1.64 {q10,q11}, [r6, :256]! - vshr.u64 d10,d13,#16 - vld1.64 {q12,q13}, [r6, :256]! - vzip.16 d12,d13 - -LNEON_tail_entry: - vadd.u64 d14,d14,d10 - vst1.32 {d12[0]}, [r7, :32]! - vshr.u64 d10,d14,#16 - vadd.u64 d15,d15,d10 - vshr.u64 d10,d15,#16 - vzip.16 d14,d15 - vadd.u64 d16,d16,d10 - vst1.32 {d14[0]}, [r7, :32]! - vshr.u64 d10,d16,#16 - vadd.u64 d17,d17,d10 - vshr.u64 d10,d17,#16 - vzip.16 d16,d17 - vadd.u64 d18,d18,d10 - vst1.32 {d16[0]}, [r7, :32]! - vshr.u64 d10,d18,#16 - vadd.u64 d19,d19,d10 - vshr.u64 d10,d19,#16 - vzip.16 d18,d19 - vadd.u64 d20,d20,d10 - vst1.32 {d18[0]}, [r7, :32]! - vshr.u64 d10,d20,#16 - vadd.u64 d21,d21,d10 - vshr.u64 d10,d21,#16 - vzip.16 d20,d21 - vadd.u64 d22,d22,d10 - vst1.32 {d20[0]}, [r7, :32]! - vshr.u64 d10,d22,#16 - vadd.u64 d23,d23,d10 - vshr.u64 d10,d23,#16 - vzip.16 d22,d23 - vadd.u64 d24,d24,d10 - vst1.32 {d22[0]}, [r7, :32]! - vshr.u64 d10,d24,#16 - vadd.u64 d25,d25,d10 - vshr.u64 d10,d25,#16 - vzip.16 d24,d25 - vadd.u64 d26,d26,d10 - vst1.32 {d24[0]}, [r7, :32]! - vshr.u64 d10,d26,#16 - vadd.u64 d27,d27,d10 - vshr.u64 d10,d27,#16 - vzip.16 d26,d27 - vld1.64 {q6,q7}, [r6, :256]! - subs r8,r8,#8 - vst1.32 {d26[0]}, [r7, :32]! 
- bne LNEON_tail - - vst1.32 {d10[0]}, [r7, :32] @ top-most bit - sub r3,r3,r5,lsl#2 @ rewind r3 - subs r1,sp,#0 @ clear carry flag - add r2,sp,r5,lsl#2 - -LNEON_sub: - ldmia r1!, {r4,r5,r6,r7} - ldmia r3!, {r8,r9,r10,r11} - sbcs r8, r4,r8 - sbcs r9, r5,r9 - sbcs r10,r6,r10 - sbcs r11,r7,r11 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne LNEON_sub - - ldr r10, [r1] @ load top-most bit - mov r11,sp - veor q0,q0,q0 - sub r11,r2,r11 @ this is num*4 - veor q1,q1,q1 - mov r1,sp - sub r0,r0,r11 @ rewind r0 - mov r3,r2 @ second 3/4th of frame - sbcs r10,r10,#0 @ result is carry flag - -LNEON_copy_n_zap: - ldmia r1!, {r4,r5,r6,r7} - ldmia r0, {r8,r9,r10,r11} - it cc - movcc r8, r4 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - itt cc - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - it cc - movcc r11,r7 - ldmia r1, {r4,r5,r6,r7} - stmia r0!, {r8,r9,r10,r11} - sub r1,r1,#16 - ldmia r0, {r8,r9,r10,r11} - it cc - movcc r8, r4 - vst1.64 {q0,q1}, [r1,:256]! @ wipe - itt cc - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - it cc - movcc r11,r7 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne LNEON_copy_n_zap - - mov sp,ip - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - bx lr @ bx lr - -#endif -.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm.c new file mode 100644 index 00000000..7cc13342 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm.c @@ -0,0 +1,277 @@ +/* Copyright (c) 2017, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#if !defined(_GNU_SOURCE) +#define _GNU_SOURCE // needed for syscall() on Linux. +#endif + +#include + +#include +#if defined(BORINGSSL_FIPS) +#include +#include +#endif + +#include +#include +#include + +#include "bcm_interface.h" +#include "../internal.h" + +// TODO(crbug.com/362530616): When delocate is removed, build these files as +// separate compilation units again. 
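The TODO above is the key to this new file's layout: instead of compiling each FIPS-module source separately, bcm.c pulls them in textually (the sources are renamed from .c to .c.inc later in this diff), so the whole module is emitted as a single translation unit that delocate and the integrity check below can treat as one contiguous region. A minimal sketch of the pattern, with hypothetical file names:

  /* module.c -- hypothetical aggregate translation unit */
  #include "part_a.c.inc"  /* formerly part_a.c, no longer compiled on its own */
  #include "part_b.c.inc"  /* formerly part_b.c */
  /* Everything from part_a and part_b now lands in the single object module.o. */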
+#include "aes/aes.c.inc" +#include "aes/aes_nohw.c.inc" +#include "aes/key_wrap.c.inc" +#include "aes/mode_wrappers.c.inc" +#include "bn/add.c.inc" +#include "bn/asm/x86_64-gcc.c.inc" +#include "bn/bn.c.inc" +#include "bn/bytes.c.inc" +#include "bn/cmp.c.inc" +#include "bn/ctx.c.inc" +#include "bn/div.c.inc" +#include "bn/div_extra.c.inc" +#include "bn/exponentiation.c.inc" +#include "bn/gcd.c.inc" +#include "bn/gcd_extra.c.inc" +#include "bn/generic.c.inc" +#include "bn/jacobi.c.inc" +#include "bn/montgomery.c.inc" +#include "bn/montgomery_inv.c.inc" +#include "bn/mul.c.inc" +#include "bn/prime.c.inc" +#include "bn/random.c.inc" +#include "bn/rsaz_exp.c.inc" +#include "bn/shift.c.inc" +#include "bn/sqrt.c.inc" +#include "cipher/aead.c.inc" +#include "cipher/cipher.c.inc" +#include "cipher/e_aes.c.inc" +#include "cipher/e_aesccm.c.inc" +#include "cmac/cmac.c.inc" +#include "dh/check.c.inc" +#include "dh/dh.c.inc" +#include "digest/digest.c.inc" +#include "digest/digests.c.inc" +#include "digestsign/digestsign.c.inc" +#include "ecdh/ecdh.c.inc" +#include "ecdsa/ecdsa.c.inc" +#include "ec/ec.c.inc" +#include "ec/ec_key.c.inc" +#include "ec/ec_montgomery.c.inc" +#include "ec/felem.c.inc" +#include "ec/oct.c.inc" +#include "ec/p224-64.c.inc" +#include "ec/p256.c.inc" +#include "ec/p256-nistz.c.inc" +#include "ec/scalar.c.inc" +#include "ec/simple.c.inc" +#include "ec/simple_mul.c.inc" +#include "ec/util.c.inc" +#include "ec/wnaf.c.inc" +#include "hkdf/hkdf.c.inc" +#include "hmac/hmac.c.inc" +#include "modes/cbc.c.inc" +#include "modes/cfb.c.inc" +#include "modes/ctr.c.inc" +#include "modes/gcm.c.inc" +#include "modes/gcm_nohw.c.inc" +#include "modes/ofb.c.inc" +#include "modes/polyval.c.inc" +#include "rand/ctrdrbg.c.inc" +#include "rand/rand.c.inc" +#include "rsa/blinding.c.inc" +#include "rsa/padding.c.inc" +#include "rsa/rsa.c.inc" +#include "rsa/rsa_impl.c.inc" +#include "self_check/fips.c.inc" +#include "self_check/self_check.c.inc" +#include "service_indicator/service_indicator.c.inc" +#include "sha/sha1.c.inc" +#include "sha/sha256.c.inc" +#include "sha/sha512.c.inc" +#include "tls/kdf.c.inc" + + +#if defined(BORINGSSL_FIPS) + +#if !defined(OPENSSL_ASAN) + +// These symbols are filled in by delocate.go (in static builds) or a linker +// script (in shared builds). They point to the start and end of the module, and +// the location of the integrity hash, respectively. +extern const uint8_t BORINGSSL_bcm_text_start[]; +extern const uint8_t BORINGSSL_bcm_text_end[]; +extern const uint8_t BORINGSSL_bcm_text_hash[]; +#if defined(BORINGSSL_SHARED_LIBRARY) +extern const uint8_t BORINGSSL_bcm_rodata_start[]; +extern const uint8_t BORINGSSL_bcm_rodata_end[]; +#endif + +// assert_within is used to sanity check that certain symbols are within the +// bounds of the integrity check. It checks that start <= symbol < end and +// aborts otherwise. +static void assert_within(const void *start, const void *symbol, + const void *end) { + const uintptr_t start_val = (uintptr_t) start; + const uintptr_t symbol_val = (uintptr_t) symbol; + const uintptr_t end_val = (uintptr_t) end; + + if (start_val <= symbol_val && symbol_val < end_val) { + return; + } + + fprintf( + stderr, + "FIPS module doesn't span expected symbol. 
Expected %p <= %p < %p\n", + start, symbol, end); + BORINGSSL_FIPS_abort(); +} + +#if defined(OPENSSL_ANDROID) && defined(OPENSSL_AARCH64) +static void BORINGSSL_maybe_set_module_text_permissions(int permission) { + // Android may be compiled in execute-only-memory mode, in which case the + // .text segment cannot be read. That conflicts with the need for a FIPS + // module to hash its own contents, therefore |mprotect| is used to make + // the module's .text readable for the duration of the hashing process. In + // other build configurations this is a no-op. + const uintptr_t page_size = getpagesize(); + const uintptr_t page_start = + ((uintptr_t)BORINGSSL_bcm_text_start) & ~(page_size - 1); + + if (mprotect((void *)page_start, + ((uintptr_t)BORINGSSL_bcm_text_end) - page_start, + permission) != 0) { + perror("BoringSSL: mprotect"); + } +} +#else +static void BORINGSSL_maybe_set_module_text_permissions(int permission) {} +#endif // !ANDROID + +#endif // !ASAN + +static void __attribute__((constructor)) +BORINGSSL_bcm_power_on_self_test(void) { +#if !defined(OPENSSL_ASAN) + // Integrity tests cannot run under ASAN because it involves reading the full + // .text section, which triggers the global-buffer overflow detection. + if (!BORINGSSL_integrity_test()) { + goto err; + } +#endif // OPENSSL_ASAN + + if (!boringssl_self_test_startup()) { + goto err; + } + + return; + +err: + BORINGSSL_FIPS_abort(); +} + +#if !defined(OPENSSL_ASAN) +int BORINGSSL_integrity_test(void) { + const uint8_t *const start = BORINGSSL_bcm_text_start; + const uint8_t *const end = BORINGSSL_bcm_text_end; + + assert_within(start, AES_encrypt, end); + assert_within(start, RSA_sign, end); + assert_within(start, BCM_rand_bytes, end); + assert_within(start, EC_GROUP_cmp, end); + assert_within(start, SHA256_Update, end); + assert_within(start, ecdsa_verify_fixed, end); + assert_within(start, EVP_AEAD_CTX_seal, end); + +#if defined(BORINGSSL_SHARED_LIBRARY) + const uint8_t *const rodata_start = BORINGSSL_bcm_rodata_start; + const uint8_t *const rodata_end = BORINGSSL_bcm_rodata_end; +#else + // In the static build, read-only data is placed within the .text segment. 
+ const uint8_t *const rodata_start = BORINGSSL_bcm_text_start; + const uint8_t *const rodata_end = BORINGSSL_bcm_text_end; +#endif + + assert_within(rodata_start, kPrimes, rodata_end); + assert_within(rodata_start, kP256Field, rodata_end); + assert_within(rodata_start, kPKCS1SigPrefixes, rodata_end); + + uint8_t result[SHA256_DIGEST_LENGTH]; + const EVP_MD *const kHashFunction = EVP_sha256(); + if (!boringssl_self_test_sha256() || + !boringssl_self_test_hmac_sha256()) { + return 0; + } + + static const uint8_t kHMACKey[64] = {0}; + unsigned result_len; + HMAC_CTX hmac_ctx; + HMAC_CTX_init(&hmac_ctx); + if (!HMAC_Init_ex(&hmac_ctx, kHMACKey, sizeof(kHMACKey), kHashFunction, + NULL /* no ENGINE */)) { + fprintf(stderr, "HMAC_Init_ex failed.\n"); + return 0; + } + + BORINGSSL_maybe_set_module_text_permissions(PROT_READ | PROT_EXEC); +#if defined(BORINGSSL_SHARED_LIBRARY) + uint64_t length = end - start; + HMAC_Update(&hmac_ctx, (const uint8_t *) &length, sizeof(length)); + HMAC_Update(&hmac_ctx, start, length); + + length = rodata_end - rodata_start; + HMAC_Update(&hmac_ctx, (const uint8_t *) &length, sizeof(length)); + HMAC_Update(&hmac_ctx, rodata_start, length); +#else + HMAC_Update(&hmac_ctx, start, end - start); +#endif + BORINGSSL_maybe_set_module_text_permissions(PROT_EXEC); + + if (!HMAC_Final(&hmac_ctx, result, &result_len) || + result_len != sizeof(result)) { + fprintf(stderr, "HMAC failed.\n"); + return 0; + } + HMAC_CTX_cleanse(&hmac_ctx); // FIPS 140-3, AS05.10. + + const uint8_t *expected = BORINGSSL_bcm_text_hash; + + if (!check_test(expected, result, sizeof(result), "FIPS integrity test")) { +#if !defined(BORINGSSL_FIPS_BREAK_TESTS) + return 0; +#endif + } + + OPENSSL_cleanse(result, sizeof(result)); // FIPS 140-3, AS05.10. + return 1; +} + +const uint8_t* FIPS_module_hash(void) { + return BORINGSSL_bcm_text_hash; +} + +#endif // OPENSSL_ASAN + +void BORINGSSL_FIPS_abort(void) { + for (;;) { + abort(); + exit(1); + } +} + +#endif // BORINGSSL_FIPS diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm_interface.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm_interface.h new file mode 100644 index 00000000..b5461595 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bcm_interface.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2024, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_BCM_INTERFACE_H +#define OPENSSL_HEADER_CRYPTO_BCM_INTERFACE_H + +#include + +// This header will eventually become the interface between BCM and the +// rest of libcrypto. More cleanly separating the two is still a work in +// progress (see https://crbug.com/boringssl/722) so, at the moment, we +// consider this no different from any other header in BCM. 
+// +// Over time, calls from libcrypto to BCM will all move to this header +// and the separation will become more meaningful. + +#if defined(__cplusplus) +extern "C" { +#endif + +// Enumerated types for return values from bcm functions, both infallible +// and fallible functions. Two success values are used to correspond to the +// FIPS service indicator. For the moment, the official service indicator +// remains the counter, not these values. Once we fully transition to +// these return values from bcm we will change that. +enum bcm_infallible_t { + bcm_infallible_approved, + bcm_infallible_not_approved, +}; + +enum bcm_status_t { + bcm_status_approved, + bcm_status_not_approved, + + // Failure codes, which must all be negative. + bcm_status_failure, +}; +typedef enum bcm_status_t bcm_status; +typedef enum bcm_infallible_t bcm_infallible; + +OPENSSL_INLINE int bcm_success(bcm_status status) { + return status == bcm_status_approved || status == bcm_status_not_approved; +} + + +// Random number generator. + +#if defined(BORINGSSL_FIPS) + +// We overread from /dev/urandom or RDRAND by a factor of 10 and XOR to whiten. +// TODO(bbe): disentangle this value which is used to calculate the size of the +// stack buffer in RAND_need entropy based on a calculation. +#define BORINGSSL_FIPS_OVERREAD 10 + +#endif // BORINGSSL_FIPS + +// BCM_rand_load_entropy supplies |entropy_len| bytes of entropy to the BCM +// module. The |want_additional_input| parameter is true iff the entropy was +// obtained from a source other than the system, e.g. directly from the CPU. +bcm_infallible BCM_rand_load_entropy(const uint8_t *entropy, size_t entropy_len, + int want_additional_input); + +// BCM_rand_bytes is the same as the public |RAND_bytes| function, other +// than returning a bcm_infallible status indicator. +OPENSSL_EXPORT bcm_infallible BCM_rand_bytes(uint8_t *out, size_t out_len); + +// BCM_rand_bytes_hwrng attempts to fill |out| with |len| bytes of entropy from +// the CPU hardware random number generator if one is present. +// bcm_status_approved is returned on success, and a failure status is +// returned otherwise. +bcm_status BCM_rand_bytes_hwrng(uint8_t *out, size_t len); + +// BCM_rand_bytes_with_additional_data samples from the RNG after mixing 32 +// bytes from |user_additional_data| in. +bcm_infallible BCM_rand_bytes_with_additional_data( + uint8_t *out, size_t out_len, const uint8_t user_additional_data[32]); + + +// SHA-1 + +// BCM_SHA_DIGEST_LENGTH is the length of a SHA-1 digest. +#define BCM_SHA_DIGEST_LENGTH 20 + +// BCM_sha1_init initialises |sha|. +bcm_infallible BCM_sha1_init(SHA_CTX *sha); + +// BCM_SHA1_transform is a low-level function that performs a single, SHA-1 +// block transformation using the state from |sha| and |SHA_CBLOCK| bytes from +// |block|. +bcm_infallible BCM_sha1_transform(SHA_CTX *c, const uint8_t data[BCM_SHA_CBLOCK]); + +// BCM_sha1_update adds |len| bytes from |data| to |sha|. +bcm_infallible BCM_sha1_update(SHA_CTX *c, const void *data, size_t len); + +// BCM_sha1_final adds the final padding to |sha| and writes the resulting +// digest to |out|, which must have at least |SHA_DIGEST_LENGTH| bytes of space. +bcm_infallible BCM_sha1_final(uint8_t out[BCM_SHA_DIGEST_LENGTH], SHA_CTX *c); + + +// BCM_fips_186_2_prf derives |out_len| bytes from |xkey| using the PRF +// defined in FIPS 186-2, Appendix 3.1, with change notice 1 applied. The b +// parameter is 160 and seed, XKEY, is also 160 bits. The optional XSEED user +// input is all zeros. 
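The header does not show how callers are expected to consume these results, so here is a minimal sketch, assuming only the declarations above: a fallible call is checked with bcm_success, while an infallible call's return value may be ignored.

  static void fill_seed_sketch(uint8_t seed[16]) {
    if (!bcm_success(BCM_rand_bytes_hwrng(seed, 16))) {
      /* No hardware RNG, or it failed. BCM_rand_bytes cannot fail; its
         bcm_infallible result only reports FIPS approval status. */
      BCM_rand_bytes(seed, 16);
    }
  }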
+// +// The PRF generates a sequence of 320-bit numbers. Each number is encoded as a +// 40-byte string in big-endian and then concatenated to form |out|. If +// |out_len| is not a multiple of 40, the result is truncated. This matches the +// construction used in Section 7 of RFC 4186 and Section 7 of RFC 4187. +// +// This PRF is based on SHA-1, a weak hash function, and should not be used +// in new protocols. It is provided for compatibility with some legacy EAP +// methods. +bcm_infallible BCM_fips_186_2_prf(uint8_t *out, size_t out_len, + const uint8_t xkey[BCM_SHA_DIGEST_LENGTH]); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_CRYPTO_BCM_INTERFACE_H diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-windows.windows.x86.S deleted file mode 100644 index d700276e..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-windows.windows.x86.S +++ /dev/null @@ -1,989 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. - -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -global _bn_mul_add_words -align 16 -_bn_mul_add_words: -L$_bn_mul_add_words_begin: - lea eax,[_OPENSSL_ia32cap_P] - bt DWORD [eax],26 - jnc NEAR L$000maw_non_sse2 - mov eax,DWORD [4+esp] - mov edx,DWORD [8+esp] - mov ecx,DWORD [12+esp] - movd mm0,DWORD [16+esp] - pxor mm1,mm1 - jmp NEAR L$001maw_sse2_entry -align 16 -L$002maw_sse2_unrolled: - movd mm3,DWORD [eax] - paddq mm1,mm3 - movd mm2,DWORD [edx] - pmuludq mm2,mm0 - movd mm4,DWORD [4+edx] - pmuludq mm4,mm0 - movd mm6,DWORD [8+edx] - pmuludq mm6,mm0 - movd mm7,DWORD [12+edx] - pmuludq mm7,mm0 - paddq mm1,mm2 - movd mm3,DWORD [4+eax] - paddq mm3,mm4 - movd mm5,DWORD [8+eax] - paddq mm5,mm6 - movd mm4,DWORD [12+eax] - paddq mm7,mm4 - movd DWORD [eax],mm1 - movd mm2,DWORD [16+edx] - pmuludq mm2,mm0 - psrlq mm1,32 - movd mm4,DWORD [20+edx] - pmuludq mm4,mm0 - paddq mm1,mm3 - movd mm6,DWORD [24+edx] - pmuludq mm6,mm0 - movd DWORD [4+eax],mm1 - psrlq mm1,32 - movd mm3,DWORD [28+edx] - add edx,32 - pmuludq mm3,mm0 - paddq mm1,mm5 - movd mm5,DWORD [16+eax] - paddq mm2,mm5 - movd DWORD [8+eax],mm1 - psrlq mm1,32 - paddq mm1,mm7 - movd mm5,DWORD [20+eax] - paddq mm4,mm5 - movd DWORD [12+eax],mm1 - psrlq mm1,32 - paddq mm1,mm2 - movd mm5,DWORD [24+eax] - paddq mm6,mm5 - movd DWORD [16+eax],mm1 - psrlq mm1,32 - paddq mm1,mm4 - movd mm5,DWORD [28+eax] - paddq mm3,mm5 - movd DWORD [20+eax],mm1 - psrlq mm1,32 - paddq mm1,mm6 - movd DWORD [24+eax],mm1 - psrlq mm1,32 - paddq mm1,mm3 - movd DWORD [28+eax],mm1 - lea eax,[32+eax] - psrlq mm1,32 - sub ecx,8 - jz NEAR L$003maw_sse2_exit -L$001maw_sse2_entry: - test ecx,4294967288 - jnz NEAR L$002maw_sse2_unrolled -align 4 -L$004maw_sse2_loop: - movd mm2,DWORD [edx] - movd mm3,DWORD [eax] - pmuludq mm2,mm0 - lea edx,[4+edx] - paddq mm1,mm3 - paddq mm1,mm2 - movd DWORD [eax],mm1 - sub ecx,1 - psrlq mm1,32 - lea eax,[4+eax] - jnz NEAR L$004maw_sse2_loop -L$003maw_sse2_exit: - movd eax,mm1 - emms - ret -align 16 -L$000maw_non_sse2: - push ebp - push ebx - push esi - push edi - ; - xor esi,esi - 
mov edi,DWORD [20+esp] - mov ecx,DWORD [28+esp] - mov ebx,DWORD [24+esp] - and ecx,4294967288 - mov ebp,DWORD [32+esp] - push ecx - jz NEAR L$005maw_finish -align 16 -L$006maw_loop: - ; Round 0 - mov eax,DWORD [ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [edi] - adc edx,0 - mov DWORD [edi],eax - mov esi,edx - ; Round 4 - mov eax,DWORD [4+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [4+edi] - adc edx,0 - mov DWORD [4+edi],eax - mov esi,edx - ; Round 8 - mov eax,DWORD [8+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [8+edi] - adc edx,0 - mov DWORD [8+edi],eax - mov esi,edx - ; Round 12 - mov eax,DWORD [12+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [12+edi] - adc edx,0 - mov DWORD [12+edi],eax - mov esi,edx - ; Round 16 - mov eax,DWORD [16+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [16+edi] - adc edx,0 - mov DWORD [16+edi],eax - mov esi,edx - ; Round 20 - mov eax,DWORD [20+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [20+edi] - adc edx,0 - mov DWORD [20+edi],eax - mov esi,edx - ; Round 24 - mov eax,DWORD [24+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [24+edi] - adc edx,0 - mov DWORD [24+edi],eax - mov esi,edx - ; Round 28 - mov eax,DWORD [28+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [28+edi] - adc edx,0 - mov DWORD [28+edi],eax - mov esi,edx - ; - sub ecx,8 - lea ebx,[32+ebx] - lea edi,[32+edi] - jnz NEAR L$006maw_loop -L$005maw_finish: - mov ecx,DWORD [32+esp] - and ecx,7 - jnz NEAR L$007maw_finish2 - jmp NEAR L$008maw_end -L$007maw_finish2: - ; Tail Round 0 - mov eax,DWORD [ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [edi] - adc edx,0 - dec ecx - mov DWORD [edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 1 - mov eax,DWORD [4+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [4+edi] - adc edx,0 - dec ecx - mov DWORD [4+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 2 - mov eax,DWORD [8+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [8+edi] - adc edx,0 - dec ecx - mov DWORD [8+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 3 - mov eax,DWORD [12+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [12+edi] - adc edx,0 - dec ecx - mov DWORD [12+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 4 - mov eax,DWORD [16+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [16+edi] - adc edx,0 - dec ecx - mov DWORD [16+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 5 - mov eax,DWORD [20+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [20+edi] - adc edx,0 - dec ecx - mov DWORD [20+edi],eax - mov esi,edx - jz NEAR L$008maw_end - ; Tail Round 6 - mov eax,DWORD [24+ebx] - mul ebp - add eax,esi - adc edx,0 - add eax,DWORD [24+edi] - adc edx,0 - mov DWORD [24+edi],eax - mov esi,edx -L$008maw_end: - mov eax,esi - pop ecx - pop edi - pop esi - pop ebx - pop ebp - ret -global _bn_mul_words -align 16 -_bn_mul_words: -L$_bn_mul_words_begin: - lea eax,[_OPENSSL_ia32cap_P] - bt DWORD [eax],26 - jnc NEAR L$009mw_non_sse2 - mov eax,DWORD [4+esp] - mov edx,DWORD [8+esp] - mov ecx,DWORD [12+esp] - movd mm0,DWORD [16+esp] - pxor mm1,mm1 -align 16 -L$010mw_sse2_loop: - movd mm2,DWORD [edx] - pmuludq mm2,mm0 - lea edx,[4+edx] - paddq mm1,mm2 - movd DWORD [eax],mm1 - sub ecx,1 - psrlq mm1,32 - lea eax,[4+eax] - jnz NEAR L$010mw_sse2_loop - movd eax,mm1 - emms - ret -align 16 -L$009mw_non_sse2: - push ebp - push ebx - push esi - push edi - ; - xor esi,esi - mov edi,DWORD [20+esp] - mov ebx,DWORD [24+esp] - 
mov ebp,DWORD [28+esp] - mov ecx,DWORD [32+esp] - and ebp,4294967288 - jz NEAR L$011mw_finish -L$012mw_loop: - ; Round 0 - mov eax,DWORD [ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [edi],eax - mov esi,edx - ; Round 4 - mov eax,DWORD [4+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [4+edi],eax - mov esi,edx - ; Round 8 - mov eax,DWORD [8+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [8+edi],eax - mov esi,edx - ; Round 12 - mov eax,DWORD [12+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [12+edi],eax - mov esi,edx - ; Round 16 - mov eax,DWORD [16+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [16+edi],eax - mov esi,edx - ; Round 20 - mov eax,DWORD [20+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [20+edi],eax - mov esi,edx - ; Round 24 - mov eax,DWORD [24+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [24+edi],eax - mov esi,edx - ; Round 28 - mov eax,DWORD [28+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [28+edi],eax - mov esi,edx - ; - add ebx,32 - add edi,32 - sub ebp,8 - jz NEAR L$011mw_finish - jmp NEAR L$012mw_loop -L$011mw_finish: - mov ebp,DWORD [28+esp] - and ebp,7 - jnz NEAR L$013mw_finish2 - jmp NEAR L$014mw_end -L$013mw_finish2: - ; Tail Round 0 - mov eax,DWORD [ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 1 - mov eax,DWORD [4+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [4+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 2 - mov eax,DWORD [8+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [8+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 3 - mov eax,DWORD [12+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [12+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 4 - mov eax,DWORD [16+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [16+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 5 - mov eax,DWORD [20+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [20+edi],eax - mov esi,edx - dec ebp - jz NEAR L$014mw_end - ; Tail Round 6 - mov eax,DWORD [24+ebx] - mul ecx - add eax,esi - adc edx,0 - mov DWORD [24+edi],eax - mov esi,edx -L$014mw_end: - mov eax,esi - pop edi - pop esi - pop ebx - pop ebp - ret -global _bn_sqr_words -align 16 -_bn_sqr_words: -L$_bn_sqr_words_begin: - lea eax,[_OPENSSL_ia32cap_P] - bt DWORD [eax],26 - jnc NEAR L$015sqr_non_sse2 - mov eax,DWORD [4+esp] - mov edx,DWORD [8+esp] - mov ecx,DWORD [12+esp] -align 16 -L$016sqr_sse2_loop: - movd mm0,DWORD [edx] - pmuludq mm0,mm0 - lea edx,[4+edx] - movq [eax],mm0 - sub ecx,1 - lea eax,[8+eax] - jnz NEAR L$016sqr_sse2_loop - emms - ret -align 16 -L$015sqr_non_sse2: - push ebp - push ebx - push esi - push edi - ; - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov ebx,DWORD [28+esp] - and ebx,4294967288 - jz NEAR L$017sw_finish -L$018sw_loop: - ; Round 0 - mov eax,DWORD [edi] - mul eax - mov DWORD [esi],eax - mov DWORD [4+esi],edx - ; Round 4 - mov eax,DWORD [4+edi] - mul eax - mov DWORD [8+esi],eax - mov DWORD [12+esi],edx - ; Round 8 - mov eax,DWORD [8+edi] - mul eax - mov DWORD [16+esi],eax - mov DWORD [20+esi],edx - ; Round 12 - mov eax,DWORD [12+edi] - mul eax - mov DWORD [24+esi],eax - mov DWORD [28+esi],edx - ; Round 16 - mov eax,DWORD [16+edi] - mul eax - mov DWORD [32+esi],eax - mov DWORD [36+esi],edx - ; Round 20 - mov eax,DWORD [20+edi] - mul eax - mov DWORD [40+esi],eax - mov DWORD [44+esi],edx - ; Round 24 - mov eax,DWORD [24+edi] - mul eax - mov DWORD [48+esi],eax 
- mov DWORD [52+esi],edx - ; Round 28 - mov eax,DWORD [28+edi] - mul eax - mov DWORD [56+esi],eax - mov DWORD [60+esi],edx - ; - add edi,32 - add esi,64 - sub ebx,8 - jnz NEAR L$018sw_loop -L$017sw_finish: - mov ebx,DWORD [28+esp] - and ebx,7 - jz NEAR L$019sw_end - ; Tail Round 0 - mov eax,DWORD [edi] - mul eax - mov DWORD [esi],eax - dec ebx - mov DWORD [4+esi],edx - jz NEAR L$019sw_end - ; Tail Round 1 - mov eax,DWORD [4+edi] - mul eax - mov DWORD [8+esi],eax - dec ebx - mov DWORD [12+esi],edx - jz NEAR L$019sw_end - ; Tail Round 2 - mov eax,DWORD [8+edi] - mul eax - mov DWORD [16+esi],eax - dec ebx - mov DWORD [20+esi],edx - jz NEAR L$019sw_end - ; Tail Round 3 - mov eax,DWORD [12+edi] - mul eax - mov DWORD [24+esi],eax - dec ebx - mov DWORD [28+esi],edx - jz NEAR L$019sw_end - ; Tail Round 4 - mov eax,DWORD [16+edi] - mul eax - mov DWORD [32+esi],eax - dec ebx - mov DWORD [36+esi],edx - jz NEAR L$019sw_end - ; Tail Round 5 - mov eax,DWORD [20+edi] - mul eax - mov DWORD [40+esi],eax - dec ebx - mov DWORD [44+esi],edx - jz NEAR L$019sw_end - ; Tail Round 6 - mov eax,DWORD [24+edi] - mul eax - mov DWORD [48+esi],eax - mov DWORD [52+esi],edx -L$019sw_end: - pop edi - pop esi - pop ebx - pop ebp - ret -global _bn_div_words -align 16 -_bn_div_words: -L$_bn_div_words_begin: - mov edx,DWORD [4+esp] - mov eax,DWORD [8+esp] - mov ecx,DWORD [12+esp] - div ecx - ret -global _bn_add_words -align 16 -_bn_add_words: -L$_bn_add_words_begin: - push ebp - push ebx - push esi - push edi - ; - mov ebx,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov edi,DWORD [28+esp] - mov ebp,DWORD [32+esp] - xor eax,eax - and ebp,4294967288 - jz NEAR L$020aw_finish -L$021aw_loop: - ; Round 0 - mov ecx,DWORD [esi] - mov edx,DWORD [edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [ebx],ecx - ; Round 1 - mov ecx,DWORD [4+esi] - mov edx,DWORD [4+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [4+ebx],ecx - ; Round 2 - mov ecx,DWORD [8+esi] - mov edx,DWORD [8+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [8+ebx],ecx - ; Round 3 - mov ecx,DWORD [12+esi] - mov edx,DWORD [12+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [12+ebx],ecx - ; Round 4 - mov ecx,DWORD [16+esi] - mov edx,DWORD [16+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [16+ebx],ecx - ; Round 5 - mov ecx,DWORD [20+esi] - mov edx,DWORD [20+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [20+ebx],ecx - ; Round 6 - mov ecx,DWORD [24+esi] - mov edx,DWORD [24+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [24+ebx],ecx - ; Round 7 - mov ecx,DWORD [28+esi] - mov edx,DWORD [28+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [28+ebx],ecx - ; - add esi,32 - add edi,32 - add ebx,32 - sub ebp,8 - jnz NEAR L$021aw_loop -L$020aw_finish: - mov ebp,DWORD [32+esp] - and ebp,7 - jz NEAR L$022aw_end - ; Tail Round 0 - mov ecx,DWORD [esi] - mov edx,DWORD [edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 1 - mov ecx,DWORD [4+esi] - mov edx,DWORD [4+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [4+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 2 - mov ecx,DWORD [8+esi] - mov edx,DWORD [8+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - 
mov DWORD [8+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 3 - mov ecx,DWORD [12+esi] - mov edx,DWORD [12+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [12+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 4 - mov ecx,DWORD [16+esi] - mov edx,DWORD [16+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [16+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 5 - mov ecx,DWORD [20+esi] - mov edx,DWORD [20+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - dec ebp - mov DWORD [20+ebx],ecx - jz NEAR L$022aw_end - ; Tail Round 6 - mov ecx,DWORD [24+esi] - mov edx,DWORD [24+edi] - add ecx,eax - mov eax,0 - adc eax,eax - add ecx,edx - adc eax,0 - mov DWORD [24+ebx],ecx -L$022aw_end: - pop edi - pop esi - pop ebx - pop ebp - ret -global _bn_sub_words -align 16 -_bn_sub_words: -L$_bn_sub_words_begin: - push ebp - push ebx - push esi - push edi - ; - mov ebx,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov edi,DWORD [28+esp] - mov ebp,DWORD [32+esp] - xor eax,eax - and ebp,4294967288 - jz NEAR L$023aw_finish -L$024aw_loop: - ; Round 0 - mov ecx,DWORD [esi] - mov edx,DWORD [edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [ebx],ecx - ; Round 1 - mov ecx,DWORD [4+esi] - mov edx,DWORD [4+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [4+ebx],ecx - ; Round 2 - mov ecx,DWORD [8+esi] - mov edx,DWORD [8+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [8+ebx],ecx - ; Round 3 - mov ecx,DWORD [12+esi] - mov edx,DWORD [12+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [12+ebx],ecx - ; Round 4 - mov ecx,DWORD [16+esi] - mov edx,DWORD [16+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [16+ebx],ecx - ; Round 5 - mov ecx,DWORD [20+esi] - mov edx,DWORD [20+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [20+ebx],ecx - ; Round 6 - mov ecx,DWORD [24+esi] - mov edx,DWORD [24+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [24+ebx],ecx - ; Round 7 - mov ecx,DWORD [28+esi] - mov edx,DWORD [28+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [28+ebx],ecx - ; - add esi,32 - add edi,32 - add ebx,32 - sub ebp,8 - jnz NEAR L$024aw_loop -L$023aw_finish: - mov ebp,DWORD [32+esp] - and ebp,7 - jz NEAR L$025aw_end - ; Tail Round 0 - mov ecx,DWORD [esi] - mov edx,DWORD [edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 1 - mov ecx,DWORD [4+esi] - mov edx,DWORD [4+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [4+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 2 - mov ecx,DWORD [8+esi] - mov edx,DWORD [8+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [8+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 3 - mov ecx,DWORD [12+esi] - mov edx,DWORD [12+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [12+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 4 - mov ecx,DWORD [16+esi] - mov edx,DWORD [16+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - dec ebp - mov DWORD [16+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 5 - mov ecx,DWORD [20+esi] - mov edx,DWORD [20+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc 
eax,0 - dec ebp - mov DWORD [20+ebx],ecx - jz NEAR L$025aw_end - ; Tail Round 6 - mov ecx,DWORD [24+esi] - mov edx,DWORD [24+edi] - sub ecx,eax - mov eax,0 - adc eax,eax - sub ecx,edx - adc eax,0 - mov DWORD [24+ebx],ecx -L$025aw_end: - pop edi - pop esi - pop ebx - pop ebp - ret -segment .bss -common _OPENSSL_ia32cap_P 16 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/add.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/add.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/add.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/add.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/asm/x86_64-gcc.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/asm/x86_64-gcc.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/asm/x86_64-gcc.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/asm/x86_64-gcc.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bn.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bn.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bn.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bn.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c.inc index e3475518..124f4418 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/bytes.c.inc @@ -186,7 +186,7 @@ void bn_assert_fits_in_bytes(const BIGNUM *bn, size_t num) { void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, size_t in_len) { // The caller should have selected an output length without truncation. - assert(fits_in_bytes(in, in_len, out_len)); + declassify_assert(fits_in_bytes(in, in_len, out_len)); // We only support little-endian platforms, so the internal representation is // also little-endian as bytes. We can simply copy it in reverse. 
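This hunk, like several below in div.c, div_extra.c, exponentiation.c, gcd_extra.c, montgomery_inv.c and mul.c, replaces a plain assert with declassify_assert: the asserted value is derived from secret data but is safe to make public, so constant-time analysis tooling should not flag the branch inside the assertion. The macro itself is not part of this diff; a rough sketch of its shape, assuming the definition in BoringSSL's crypto/internal.h:

  /* Sketch only; the real definition lives in crypto/internal.h, outside this diff. */
  #define declassify_assert(expr) assert(constant_time_declassify_int(expr))

Relatedly, the gcd_extra.c hunk further below changes BN_is_zero(u) || BN_is_zero(v) to BN_is_zero(u) | BN_is_zero(v): with 0/1 operands the two are equivalent, but the bitwise form avoids the short-circuit branch on the left operand.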
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/cmp.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/cmp.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/cmp.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/cmp.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/ctx.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/ctx.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/ctx.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/ctx.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c.inc similarity index 75% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c.inc index 4233f4a9..f15843fd 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div.c.inc @@ -149,11 +149,11 @@ static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out, // * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65668 // // Clang bugs: - // * https://llvm.org/bugs/show_bug.cgi?id=6397 - // * https://llvm.org/bugs/show_bug.cgi?id=12418 + // * https://github.com/llvm/llvm-project/issues/6769 + // * https://github.com/llvm/llvm-project/issues/12790 // - // These issues aren't specific to x86 and x86_64, so it might be worthwhile - // to add more assembly language implementations. + // These is specific to x86 and x86_64; Arm and RISC-V do not have double-wide + // division instructions. #if defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86) __asm__ volatile("divl %4" : "=a"(*quotient_out), "=d"(*rem_out) @@ -175,44 +175,16 @@ static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out, #endif } -// BN_div computes "quotient := numerator / divisor", rounding towards zero, -// and sets up |rem| such that "quotient * divisor + rem = numerator" holds. -// -// Thus: -// -// quotient->neg == numerator->neg ^ divisor->neg -// (unless the result is zero) -// rem->neg == numerator->neg -// (unless the remainder is zero) -// -// If |quotient| or |rem| is NULL, the respective value is not returned. -// -// This was specifically designed to contain fewer branches that may leak -// sensitive information; see "New Branch Prediction Vulnerabilities in OpenSSL -// and Necessary Software Countermeasures" by Onur Acıçmez, Shay Gueron, and -// Jean-Pierre Seifert. int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator, const BIGNUM *divisor, BN_CTX *ctx) { - int norm_shift, loop; - BIGNUM wnum; - BN_ULONG *resp, *wnump; - BN_ULONG d0, d1; - int num_n, div_n; - - // This function relies on the historical minimal-width |BIGNUM| invariant. - // It is already not constant-time (constant-time reductions should use - // Montgomery logic), so we shrink all inputs and intermediate values to - // retain the previous behavior. - - // Invalid zero-padding would have particularly bad consequences. - int numerator_width = bn_minimal_width(numerator); - int divisor_width = bn_minimal_width(divisor); - if ((numerator_width > 0 && numerator->d[numerator_width - 1] == 0) || - (divisor_width > 0 && divisor->d[divisor_width - 1] == 0)) { - OPENSSL_PUT_ERROR(BN, BN_R_NOT_INITIALIZED); - return 0; - } - + // This function implements long division, per Knuth, The Art of Computer + // Programming, Volume 2, Chapter 4.3.1, Algorithm D. 
This algorithm only + // divides non-negative integers, but we round towards zero, so we divide + // absolute values and adjust the signs separately. + // + // Inputs to this function are assumed public and may be leaked by timing and + // cache side channels. Division with secret inputs should use other + // implementation strategies such as Montgomery reduction. if (BN_is_zero(divisor)) { OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO); return 0; @@ -222,174 +194,168 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator, BIGNUM *tmp = BN_CTX_get(ctx); BIGNUM *snum = BN_CTX_get(ctx); BIGNUM *sdiv = BN_CTX_get(ctx); - BIGNUM *res = NULL; - if (quotient == NULL) { - res = BN_CTX_get(ctx); - } else { - res = quotient; - } - if (sdiv == NULL || res == NULL) { + BIGNUM *res = quotient == NULL ? BN_CTX_get(ctx) : quotient; + if (tmp == NULL || snum == NULL || sdiv == NULL || res == NULL) { goto err; } - // First we normalise the numbers - norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2); - if (!BN_lshift(sdiv, divisor, norm_shift)) { + // Knuth step D1: Normalise the numbers such that the divisor's MSB is set. + // This ensures, in Knuth's terminology, that v1 >= b/2, needed for the + // quotient estimation step. + int norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2); + if (!BN_lshift(sdiv, divisor, norm_shift) || + !BN_lshift(snum, numerator, norm_shift)) { goto err; } + + // This algorithm relies on |sdiv| being minimal width. We do not use this + // function on secret inputs, so leaking this is fine. Also minimize |snum| to + // avoid looping on leading zeros, as we're not trying to be leak-free. bn_set_minimal_width(sdiv); - sdiv->neg = 0; - norm_shift += BN_BITS2; - if (!BN_lshift(snum, numerator, norm_shift)) { - goto err; - } bn_set_minimal_width(snum); - snum->neg = 0; - - // Since we don't want to have special-case logic for the case where snum is - // larger than sdiv, we pad snum with enough zeroes without changing its - // value. - if (snum->width <= sdiv->width + 1) { - if (!bn_wexpand(snum, sdiv->width + 2)) { - goto err; - } - for (int i = snum->width; i < sdiv->width + 2; i++) { - snum->d[i] = 0; - } - snum->width = sdiv->width + 2; - } else { - if (!bn_wexpand(snum, snum->width + 1)) { - goto err; - } - snum->d[snum->width] = 0; - snum->width++; - } - - div_n = sdiv->width; - num_n = snum->width; - loop = num_n - div_n; - // Lets setup a 'window' into snum - // This is the part that corresponds to the current - // 'area' being divided - wnum.neg = 0; - wnum.d = &(snum->d[loop]); - wnum.width = div_n; - // only needed when BN_ucmp messes up the values between width and max - wnum.dmax = snum->dmax - loop; // so we don't step out of bounds - - // Get the top 2 words of sdiv - // div_n=sdiv->width; - d0 = sdiv->d[div_n - 1]; - d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2]; - - // pointer to the 'top' of snum - wnump = &(snum->d[num_n - 1]); - - // Setup |res|. |numerator| and |res| may alias, so we save |numerator->neg| - // for later. - const int numerator_neg = numerator->neg; - res->neg = (numerator_neg ^ divisor->neg); - if (!bn_wexpand(res, loop + 1)) { + int div_n = sdiv->width; + const BN_ULONG d0 = sdiv->d[div_n - 1]; + const BN_ULONG d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2]; + assert(d0 & (((BN_ULONG)1) << (BN_BITS2 - 1))); + + // Extend |snum| with zeros to satisfy the long division invariants: + // - |snum| must have at least |div_n| + 1 words. 
+ // - |snum|'s most significant word must be zero to guarantee the first loop + // iteration works with a prefix greater than |sdiv|. (This is the extra u0 + // digit in Knuth step D1.) + int num_n = snum->width <= div_n ? div_n + 1 : snum->width + 1; + if (!bn_resize_words(snum, num_n)) { goto err; } - res->width = loop - 1; - resp = &(res->d[loop - 1]); - // space for temp - if (!bn_wexpand(tmp, div_n + 1)) { + // Knuth step D2: The quotient's width is the difference between numerator and + // denominator. Also set up its sign and size a temporary for the loop. + int loop = num_n - div_n; + res->neg = snum->neg ^ sdiv->neg; + if (!bn_wexpand(res, loop) || // + !bn_wexpand(tmp, div_n + 1)) { goto err; } - - // if res->width == 0 then clear the neg value otherwise decrease - // the resp pointer - if (res->width == 0) { - res->neg = 0; - } else { - resp--; - } - - for (int i = 0; i < loop - 1; i++, wnump--, resp--) { - BN_ULONG q, l0; - // the first part of the loop uses the top two words of snum and sdiv to - // calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv - BN_ULONG n0, n1, rm = 0; - - n0 = wnump[0]; - n1 = wnump[-1]; + res->width = loop; + + // Knuth steps D2 through D7: Compute the quotient with a word-by-word long + // division. Note that Knuth indexes words from most to least significant, so + // our index is reversed. Each loop iteration computes res->d[i] of the + // quotient and updates snum with the running remainder. Before each loop + // iteration, the div_n words beginning at snum->d[i+1] must be less than + // snum. + for (int i = loop - 1; i >= 0; i--) { + // The next word of the quotient, q, is floor(wnum / sdiv), where wnum is + // the div_n + 1 words beginning at snum->d[i]. i starts at + // num_n - div_n - 1, so there are at least div_n + 1 words available. + // + // Knuth step D3: Compute q', an estimate of q by looking at the top words + // of wnum and sdiv. We must estimate such that q' = q or q' = q + 1. + BN_ULONG q, rm = 0; + BN_ULONG *wnum = snum->d + i; + BN_ULONG n0 = wnum[div_n]; + BN_ULONG n1 = wnum[div_n - 1]; if (n0 == d0) { + // Estimate q' = b - 1, where b is the base. q = BN_MASK2; + // Knuth also runs the fixup routine in this case, but this would require + // computing rm and is unnecessary. q' is already close enough. That is, + // the true quotient, q is either b - 1 or b - 2. + // + // By the loop invariant, q <= b - 1, so we must show that q >= b - 2. We + // do this by showing wnum / sdiv >= b - 2. Suppose wnum / sdiv < b - 2. + // wnum and sdiv have the same most significant word, so: + // + // wnum >= n0 * b^div_n + // sdiv < (n0 + 1) * b^(d_div - 1) + // + // Thus: + // + // b - 2 > wnum / sdiv + // > (n0 * b^div_n) / (n0 + 1) * b^(div_n - 1) + // = (n0 * b) / (n0 + 1) + // + // (n0 + 1) * (b - 2) > n0 * b + // n0 * b + b - 2 * n0 - 2 > n0 * b + // b - 2 > 2 * n0 + // b/2 - 1 > n0 + // + // This contradicts the normalization condition, so q >= b - 2 and our + // estimate is close enough. } else { - // n0 < d0 + // Estimate q' = floor(n0n1 / d0). Per Theorem B, q' - 2 <= q <= q', which + // is slightly outside of our bounds. + assert(n0 < d0); bn_div_rem_words(&q, &rm, n0, n1, d0); + // Fix the estimate by examining one more word and adjusting q' as needed. + // This is the second half of step D3 and is sufficient per exercises 19, + // 20, and 21. Although only one iteration is needed to correct q + 2 to + // q + 1, Knuth uses a loop. 
A loop will often also correct q + 1 to q, + // saving the slightly more expensive underflow handling below. + if (div_n > 1) { + BN_ULONG n2 = wnum[div_n - 2]; #ifdef BN_ULLONG - BN_ULLONG t2 = (BN_ULLONG)d1 * q; - for (;;) { - if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | wnump[-2])) { - break; + BN_ULLONG t2 = (BN_ULLONG)d1 * q; + for (;;) { + if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | n2)) { + break; + } + q--; + rm += d0; + if (rm < d0) { + // If rm overflows, the true value exceeds BN_ULONG and the next + // t2 comparison should exit the loop. + break; + } + t2 -= d1; } - q--; - rm += d0; - if (rm < d0) { - break; // don't let rm overflow - } - t2 -= d1; - } -#else // !BN_ULLONG - BN_ULONG t2l, t2h; - BN_UMULT_LOHI(t2l, t2h, d1, q); - for (;;) { - if (t2h < rm || - (t2h == rm && t2l <= wnump[-2])) { - break; +#else // !BN_ULLONG + BN_ULONG t2l, t2h; + BN_UMULT_LOHI(t2l, t2h, d1, q); + for (;;) { + if (t2h < rm || (t2h == rm && t2l <= n2)) { + break; + } + q--; + rm += d0; + if (rm < d0) { + // If rm overflows, the true value exceeds BN_ULONG and the next + // t2 comparison should exit the loop. + break; + } + if (t2l < d1) { + t2h--; + } + t2l -= d1; } - q--; - rm += d0; - if (rm < d0) { - break; // don't let rm overflow - } - if (t2l < d1) { - t2h--; - } - t2l -= d1; - } #endif // !BN_ULLONG + } } - l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q); - tmp->d[div_n] = l0; - wnum.d--; - // ingore top values of the bignums just sub the two - // BN_ULONG arrays with bn_sub_words - if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) { - // Note: As we have considered only the leading - // two BN_ULONGs in the calculation of q, sdiv * q - // might be greater than wnum (but then (q-1) * sdiv - // is less or equal than wnum) + // Knuth step D4 through D6: Now q' = q or q' = q + 1, and + // -sdiv < wnum - sdiv * q < sdiv. If q' = q + 1, the subtraction will + // underflow, and we fix it up below. + tmp->d[div_n] = bn_mul_words(tmp->d, sdiv->d, div_n, q); + if (bn_sub_words(wnum, wnum, tmp->d, div_n + 1)) { q--; - if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) { - // we can't have an overflow here (assuming - // that q != 0, but if q == 0 then tmp is - // zero anyway) - (*wnump)++; - } + // The final addition is expected to overflow, canceling the underflow. + wnum[div_n] += bn_add_words(wnum, wnum, sdiv->d, div_n); } - // store part of the result - *resp = q; + + // q is now correct, and wnum has been updated to the running remainder. + res->d[i] = q; } + // Trim leading zeros and correct any negative zeros. bn_set_minimal_width(snum); + bn_set_minimal_width(res); - if (rem != NULL) { - if (!BN_rshift(rem, snum, norm_shift)) { - goto err; - } - if (!BN_is_zero(rem)) { - rem->neg = numerator_neg; - } + // Knuth step D8: Unnormalize. snum now contains the remainder. + if (rem != NULL && !BN_rshift(rem, snum, norm_shift)) { + goto err; } - bn_set_minimal_width(res); BN_CTX_end(ctx); return 1; @@ -406,8 +372,9 @@ int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) { return 1; } - // now -|d| < r < 0, so we have to set r := r + |d|. - return (d->neg ? BN_sub : BN_add)(r, r, d); + // now -d < r < 0, so we have to set r := r + d. Ignoring the sign bits, this + // is r = d - r. 
+ return BN_usub(r, d, r); } BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry, @@ -425,7 +392,7 @@ BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry, // // Although |carry| may be one if it was one on input and |bn_sub_words| // returns zero, this would give |r| > |m|, violating our input assumptions. - assert(carry == 0 || carry == (BN_ULONG)-1); + declassify_assert(carry + 1 <= 1); bn_select_words(r, carry, a /* r < 0 */, r /* r >= 0 */, num); return carry; } @@ -434,7 +401,7 @@ BN_ULONG bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, const BN_ULONG *m, BN_ULONG *tmp, size_t num) { // See |bn_reduce_once| for why this logic works. carry -= bn_sub_words(tmp, r, m, num); - assert(carry == 0 || carry == (BN_ULONG)-1); + declassify_assert(carry + 1 <= 1); bn_select_words(r, carry, r /* tmp < 0 */, tmp /* tmp >= 0 */, num); return carry; } @@ -504,7 +471,7 @@ int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder, // |divisor_min_bits| bits, the top |divisor_min_bits - 1| can be incorporated // without reductions. This significantly speeds up |RSA_check_key|. For // simplicity, we round down to a whole number of words. - assert(divisor_min_bits <= BN_num_bits(divisor)); + declassify_assert(divisor_min_bits <= BN_num_bits(divisor)); int initial_words = 0; if (divisor_min_bits > 0) { initial_words = (divisor_min_bits - 1) / BN_BITS2; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c.inc index f5d5eeec..41c5dbdc 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/div_extra.c.inc @@ -39,7 +39,7 @@ static uint16_t mod_u16(uint32_t n, uint16_t d, uint32_t p, uint32_t m) { // Multiply and subtract to get the remainder. n -= d * t; - assert(n < d); + declassify_assert(n < d); return n; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c.inc similarity index 94% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c.inc index 64b52572..b18520e7 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/exponentiation.c.inc @@ -119,6 +119,50 @@ #include "internal.h" #include "rsaz_exp.h" +#if defined(OPENSSL_BN_ASM_MONT5) + +// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it +// by |ap| modulo |np|, and stores the result in |rp|. The values are |num| +// words long and represented in Montgomery form. |n0| is a pointer to the +// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least +// 16 bytes. |power| must be less than 32 and is treated as secret. +// +// WARNING: This function implements Almost Montgomery Multiplication from +// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. +// However, even if they are fully reduced, the output may not be. 
+static void bn_mul_mont_gather5( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power) { + if (bn_mulx4x_mont_gather5_capable(num)) { + bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power); + } else if (bn_mul4x_mont_gather5_capable(num)) { + bn_mul4x_mont_gather5(rp, ap, table, np, n0, num, power); + } else { + bn_mul_mont_gather5_nohw(rp, ap, table, np, n0, num, power); + } +} + +// bn_power5 squares |ap| five times and multiplies it by the value stored at +// index |power| of |table|, modulo |np|. It stores the result in |rp|. The +// values are |num| words long and represented in Montgomery form. |n0| is a +// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible +// by 8. |power| must be less than 32 and is treated as secret. +// +// WARNING: This function implements Almost Montgomery Multiplication from +// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. +// However, even if they are fully reduced, the output may not be. +static void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, + int power) { + assert(bn_power5_capable(num)); + if (bn_powerx5_capable(num)) { + bn_powerx5(rp, ap, table, np, n0, num, power); + } else { + bn_power5_nohw(rp, ap, table, np, n0, num, power); + } +} + +#endif // defined(OPENSSL_BN_ASM_MONT5) int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { int i, bits, ret = 0; @@ -1013,7 +1057,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, // Prepare a^1 in the Montgomery domain. assert(!a->neg); - assert(BN_ucmp(a, m) < 0); + declassify_assert(BN_ucmp(a, m) < 0); if (!BN_to_montgomery(&am, a, mont, ctx) || !bn_resize_words(&am, top)) { goto err; @@ -1079,7 +1123,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, // Scan the exponent one window at a time starting from the most // significant bits. - if (top & 7) { + if (!bn_power5_capable(top)) { while (bits >= 0) { for (wvalue = 0, i = 0; i < 5; i++, bits--) { wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c.inc similarity index 96% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c.inc index 8fe14379..249ffbf8 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/gcd_extra.c.inc @@ -93,7 +93,7 @@ static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x, // At least one of |u| and |v| is now even. BN_ULONG u_is_odd = word_is_odd_mask(u->d[0]); BN_ULONG v_is_odd = word_is_odd_mask(v->d[0]); - assert(!(u_is_odd & v_is_odd)); + declassify_assert(!(u_is_odd & v_is_odd)); // If both are even, the final GCD gains a factor of two. shift += 1 & (~u_is_odd & ~v_is_odd); @@ -106,7 +106,7 @@ static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x, // One of |u| or |v| is zero at this point. 
The algorithm usually makes |u| // zero, unless |y| was already zero on input. Fix this by combining the // values. - assert(BN_is_zero(u) || BN_is_zero(v)); + declassify_assert(BN_is_zero(u) | BN_is_zero(v)); for (size_t i = 0; i < width; i++) { v->d[i] |= u->d[i]; } @@ -289,7 +289,7 @@ int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a, // and |v| is now even. BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]); BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]); - assert(u_is_even != v_is_even); + declassify_assert(u_is_even != v_is_even); // Halve the even one and adjust the corresponding coefficient. maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width); @@ -313,8 +313,11 @@ int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a, maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width); } - assert(BN_is_zero(v)); - if (!BN_is_one(u)) { + declassify_assert(BN_is_zero(v)); + // While the inputs and output are secret, this function considers whether the + // input was invertible to be public. It is used as part of RSA key + // generation, where inputs are chosen to already be invertible. + if (constant_time_declassify_int(!BN_is_one(u))) { *out_no_inverse = 1; OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); goto err; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/generic.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/generic.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/generic.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/generic.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/internal.h index dd3ee9fc..570276a6 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/internal.h @@ -438,18 +438,26 @@ int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) #define OPENSSL_BN_ASM_MONT5 -// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it -// by |ap| modulo |np|, and stores the result in |rp|. The values are |num| -// words long and represented in Montgomery form. |n0| is a pointer to the -// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least -// 16 bytes. |power| must be less than 32 and is treated as secret. -// -// WARNING: This function implements Almost Montgomery Multiplication from -// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. -// However, even if they are fully reduced, the output may not be. -void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, - const BN_ULONG *table, const BN_ULONG *np, - const BN_ULONG *n0, int num, int power); +// The following functions implement |bn_mul_mont_gather5|. See +// |bn_mul_mont_gather5| for details. 
+OPENSSL_INLINE int bn_mul4x_mont_gather5_capable(int num) { + return (num & 7) == 0; +} +void bn_mul4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); + +OPENSSL_INLINE int bn_mulx4x_mont_gather5_capable(int num) { + return bn_mul4x_mont_gather5_capable(num) && CRYPTO_is_ADX_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable(); +} +void bn_mulx4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); + +void bn_mul_mont_gather5_nohw(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); // bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of // |table| are |num| words long. |power| must be less than 32 and is treated as @@ -463,17 +471,19 @@ void bn_scatter5(const BN_ULONG *inp, size_t num, BN_ULONG *table, // is treated as secret. |table| must be aligned to at least 16 bytes. void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power); -// bn_power5 squares |ap| five times and multiplies it by the value stored at -// index |power| of |table|, modulo |np|. It stores the result in |rp|. The -// values are |num| words long and represented in Montgomery form. |n0| is a -// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible -// by 8. |power| must be less than 32 and is treated as secret. -// -// WARNING: This function implements Almost Montgomery Multiplication from -// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. -// However, even if they are fully reduced, the output may not be. -void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, - const BN_ULONG *np, const BN_ULONG *n0, int num, int power); +// The following functions implement |bn_power5|. See |bn_power5| for details. 
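For context, the capability helpers introduced in this hunk let plain C pick the widest assembly path the CPU and operand size allow: the MULX/ADX variants require BMI1, BMI2 and ADX plus an operand length divisible by eight words, and the code otherwise falls back to the 4x or no-hardware routines. A minimal, self-contained sketch of that selection logic follows; the names used here (cpu_has_adx_bmi, power5_capable, pick_power5_impl) are illustrative stand-ins, not BoringSSL functions.

#include <stdio.h>

/* Sketch of the capability-gated dispatch pattern: pick the MULX/ADX assembly
   when both the CPU and the operand width allow it, the plain 4x/nohw code when
   only the width allows it, and the generic path otherwise. Stand-in names. */
static int cpu_has_adx_bmi(void) { return 0; } /* stand-in for the CPUID probes */

static int power5_capable(int num) { return (num & 7) == 0; }

static int powerx5_capable(int num) {
  return power5_capable(num) && cpu_has_adx_bmi();
}

static const char *pick_power5_impl(int num) {
  if (!power5_capable(num)) {
    return "generic window code"; /* |num| not a multiple of 8 */
  }
  if (powerx5_capable(num)) {
    return "bn_powerx5 (MULX/ADX)";
  }
  return "bn_power5_nohw";
}

int main(void) {
  for (int num = 4; num <= 32; num += 4) {
    printf("num=%2d -> %s\n", num, pick_power5_impl(num));
  }
  return 0;
}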
+void bn_power5_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, int power); + +OPENSSL_INLINE int bn_power5_capable(int num) { return (num & 7) == 0; } + +OPENSSL_INLINE int bn_powerx5_capable(int num) { + return bn_power5_capable(num) && CRYPTO_is_ADX_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable(); +} +void bn_powerx5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, int power); + #endif // !OPENSSL_NO_ASM && OPENSSL_X86_64 uint64_t bn_mont_n0(const BIGNUM *n); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/jacobi.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/jacobi.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/jacobi.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/jacobi.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c.inc index 1f71dd44..04a491eb 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/montgomery_inv.c.inc @@ -153,7 +153,7 @@ static uint64_t bn_neg_inv_mod_r_u64(uint64_t n) { // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha. #if BN_BITS2 == 64 && defined(BN_ULLONG) - assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); + declassify_assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); #endif return v; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c.inc index 5ae6b49b..e7fd224d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/mul.c.inc @@ -292,7 +292,7 @@ static void bn_mul_recursive(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, } // The product should fit without carries. - assert(c == 0); + declassify_assert(c == 0); } // bn_mul_part_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r| @@ -406,7 +406,7 @@ static void bn_mul_part_recursive(BN_ULONG *r, const BN_ULONG *a, } // The product should fit without carries. - assert(c == 0); + declassify_assert(c == 0); } // bn_mul_impl implements |BN_mul| and |bn_mul_consttime|. 
Note this function diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c.inc similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c.inc index d5891394..ee0ce622 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/prime.c.inc @@ -487,7 +487,10 @@ int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add, static int bn_trial_division(uint16_t *out, const BIGNUM *bn) { const size_t num_primes = num_trial_division_primes(bn); for (size_t i = 1; i < num_primes; i++) { - if (bn_mod_u16_consttime(bn, kPrimes[i]) == 0) { + // During RSA key generation, |bn| may be secret, but only if |bn| was + // prime, so it is safe to leak failed trial divisions. + if (constant_time_declassify_int(bn_mod_u16_consttime(bn, kPrimes[i]) == + 0)) { *out = kPrimes[i]; return 1; } @@ -573,7 +576,8 @@ int bn_miller_rabin_iteration(const BN_MILLER_RABIN *miller_rabin, // To avoid leaking |a|, we run the loop to |w_bits| and mask off all // iterations once |j| = |a|. for (int j = 1; j < miller_rabin->w_bits; j++) { - if (constant_time_eq_int(j, miller_rabin->a) & ~is_possibly_prime) { + if (constant_time_declassify_w(constant_time_eq_int(j, miller_rabin->a) & + ~is_possibly_prime)) { // If the loop is done and we haven't seen z = 1 or z = w-1 yet, the // value is composite and we can break in variable time. break; @@ -593,12 +597,14 @@ int bn_miller_rabin_iteration(const BN_MILLER_RABIN *miller_rabin, // Step 4.5.3. If z = 1 and the loop is not done, the previous value of z // was not -1. There are no non-trivial square roots of 1 modulo a prime, so // w is composite and we may exit in variable time. - if (BN_equal_consttime(z, miller_rabin->one_mont) & ~is_possibly_prime) { + if (constant_time_declassify_w( + BN_equal_consttime(z, miller_rabin->one_mont) & + ~is_possibly_prime)) { break; } } - *out_is_possibly_prime = is_possibly_prime & 1; + *out_is_possibly_prime = constant_time_declassify_w(is_possibly_prime) & 1; ret = 1; err: @@ -736,8 +742,9 @@ int BN_primality_test(int *out_is_probably_prime, const BIGNUM *w, int checks, crypto_word_t uniform_iterations = 0; // Using |constant_time_lt_w| seems to prevent the compiler from optimizing // this into two jumps. 
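The recurring assert-to-declassify_assert and constant_time_declassify_* changes in these hunks share one idea: a value computed from secrets but safe to reveal (for example, "this candidate failed trial division") is explicitly marked public before the code branches on it, so constant-time validation tooling does not report a secret-dependent branch. In ordinary builds the helpers reduce to a plain assert or an identity function. The rough model below illustrates the pattern; the names are stand-ins, not the BoringSSL definitions.

#include <assert.h>

/* Rough model of the declassification helpers: in release builds they are an
   identity function around assert(); under constant-time validation they would
   additionally mark the value as public (e.g. via a Valgrind client request).
   All names here are illustrative stand-ins. */
typedef unsigned long crypto_word_sketch_t;

static crypto_word_sketch_t declassify_w_sketch(crypto_word_sketch_t v) {
  /* A validation build would mark |v| as public at this point. */
  return v;
}

#define declassify_assert_sketch(x) assert(declassify_w_sketch(x))

int main(void) {
  crypto_word_sketch_t carry = 0; /* e.g. "the product should fit without carries" */
  declassify_assert_sketch(carry == 0);
  /* Branching on |carry| after this point is treated as public. */
  return (int)carry;
}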
- for (int i = 1; (i <= BN_PRIME_CHECKS_BLINDED) | - constant_time_lt_w(uniform_iterations, checks); + for (int i = 1; constant_time_declassify_w( + (i <= BN_PRIME_CHECKS_BLINDED) | + constant_time_lt_w(uniform_iterations, checks)); i++) { // Step 4.1-4.2 int is_uniform; @@ -766,7 +773,7 @@ int BN_primality_test(int *out_is_probably_prime, const BIGNUM *w, int checks, } } - assert(uniform_iterations >= (crypto_word_t)checks); + declassify_assert(uniform_iterations >= (crypto_word_t)checks); *out_is_probably_prime = 1; ret = 1; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c.inc similarity index 96% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c.inc index 91d337fd..5e25b93d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/random.c.inc @@ -113,10 +113,9 @@ #include #include -#include #include "../../internal.h" -#include "../rand/internal.h" +#include "../bcm_interface.h" #include "../service_indicator/internal.h" #include "internal.h" @@ -157,7 +156,7 @@ int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) { } FIPS_service_indicator_lock_state(); - RAND_bytes((uint8_t *)rnd->d, words * sizeof(BN_ULONG)); + BCM_rand_bytes((uint8_t *)rnd->d, words * sizeof(BN_ULONG)); FIPS_service_indicator_unlock_state(); rnd->d[words - 1] &= mask; @@ -225,8 +224,7 @@ static int bn_range_to_mask(size_t *out_words, BN_ULONG *out_mask, while (words > 0 && max_exclusive[words - 1] == 0) { words--; } - if (words == 0 || - (words == 1 && max_exclusive[0] <= min_inclusive)) { + if (words == 0 || (words == 1 && max_exclusive[0] <= min_inclusive)) { OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE); return 0; } @@ -275,8 +273,8 @@ int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive, // Steps 4 and 5. Use |words| and |mask| together to obtain a string of N // bits, where N is the bit length of |max_exclusive|. FIPS_service_indicator_lock_state(); - RAND_bytes_with_additional_data((uint8_t *)out, words * sizeof(BN_ULONG), - additional_data); + BCM_rand_bytes_with_additional_data( + (uint8_t *)out, words * sizeof(BN_ULONG), additional_data); FIPS_service_indicator_unlock_state(); out[words - 1] &= mask; @@ -326,7 +324,7 @@ int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive, // Select a uniform random number with num_bits(max_exclusive) bits. FIPS_service_indicator_lock_state(); - RAND_bytes((uint8_t *)r->d, words * sizeof(BN_ULONG)); + BCM_rand_bytes((uint8_t *)r->d, words * sizeof(BN_ULONG)); FIPS_service_indicator_unlock_state(); r->d[words - 1] &= mask; @@ -339,7 +337,8 @@ int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive, // If the value is not in range, force it to be in range. 
r->d[0] |= constant_time_select_w(in_range, 0, min_inclusive); r->d[words - 1] &= constant_time_select_w(in_range, BN_MASK2, mask >> 1); - assert(bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words)); + declassify_assert( + bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words)); r->neg = 0; r->width = (int)words; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/rsaz_exp.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/rsaz_exp.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/rsaz_exp.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/rsaz_exp.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/shift.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/shift.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/shift.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/shift.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c.inc index 13c7892c..b4b98158 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn/sqrt.c.inc @@ -236,7 +236,7 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { goto end; } if (BN_ucmp(y, p) >= 0) { - if (!(p->neg ? BN_add : BN_sub)(y, y, p)) { + if (!BN_usub(y, y, p)) { goto end; } } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-ios.ios.arm.S deleted file mode 100644 index c7bc35e8..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-ios.ios.arm.S +++ /dev/null @@ -1,1534 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ -@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ of Linaro. Permission to use under GPL terms is granted. -@ ==================================================================== - -@ Bit-sliced AES for ARM NEON -@ -@ February 2012. -@ -@ This implementation is direct adaptation of bsaes-x86_64 module for -@ ARM NEON. Except that this module is endian-neutral [in sense that -@ it can be compiled for either endianness] by courtesy of vld1.8's -@ neutrality.
Initial version doesn't implement interface to OpenSSL, -@ only low-level primitives and unsupported entry points, just enough -@ to collect performance results, which for Cortex-A8 core are: -@ -@ encrypt 19.5 cycles per byte processed with 128-bit key -@ decrypt 22.1 cycles per byte processed with 128-bit key -@ key conv. 440 cycles per 128-bit key/0.18 of 8x block -@ -@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, -@ which is [much] worse than anticipated (for further details see -@ http://www.openssl.org/~appro/Snapdragon-S4.html). -@ -@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code -@ manages in 20.0 cycles]. -@ -@ When comparing to x86_64 results keep in mind that NEON unit is -@ [mostly] single-issue and thus can't [fully] benefit from -@ instruction-level parallelism. And when comparing to aes-armv4 -@ results keep in mind key schedule conversion overhead (see -@ bsaes-x86_64.pl for further details)... -@ -@ - -@ April-August 2013 -@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. - -#ifndef __KERNEL__ -# include - -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -# define VFP_ABI_FRAME 0x40 -#else -# define VFP_ABI_PUSH -# define VFP_ABI_POP -# define VFP_ABI_FRAME 0 -# define BSAES_ASM_EXTENDED_KEY -# define XTS_CHAIN_TWEAK -# define __ARM_MAX_ARCH__ 7 -#endif - -#ifdef __thumb__ -# define adrl adr -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.text -.syntax unified @ ARMv7-capable assembler is expected to handle this -#if defined(__thumb2__) && !defined(__APPLE__) -.thumb -#else -.code 32 -# undef __thumb2__ -#endif - -#ifdef __thumb2__ -.thumb_func _bsaes_decrypt8 -#endif -.align 4 -_bsaes_decrypt8: - adr r6,. - vldmia r4!, {q9} @ round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0ISR -#else - add r6,r6,#LM0ISR-_bsaes_decrypt8 -#endif - - vldmia r6!, {q8} @ LM0ISR - veor q10, q0, q9 @ xor with round0 key - veor q11, q1, q9 - vtbl.8 d0, {q10}, d16 - vtbl.8 d1, {q10}, d17 - veor q12, q2, q9 - vtbl.8 d2, {q11}, d16 - vtbl.8 d3, {q11}, d17 - veor q13, q3, q9 - vtbl.8 d4, {q12}, d16 - vtbl.8 d5, {q12}, d17 - veor q14, q4, q9 - vtbl.8 d6, {q13}, d16 - vtbl.8 d7, {q13}, d17 - veor q15, q5, q9 - vtbl.8 d8, {q14}, d16 - vtbl.8 d9, {q14}, d17 - veor q10, q6, q9 - vtbl.8 d10, {q15}, d16 - vtbl.8 d11, {q15}, d17 - veor q11, q7, q9 - vtbl.8 d12, {q10}, d16 - vtbl.8 d13, {q10}, d17 - vtbl.8 d14, {q11}, d16 - vtbl.8 d15, {q11}, d17 - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q4, #1 - veor q10, q10, q7 - veor q11, q11, q5 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #1 - veor q5, q5, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q3 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q3, q3, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q5, #2 - vshr.u64 q11, q4, #2 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q7, q7, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q5, q5, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q3 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q3, q3, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - 
veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q3, #4 - vshr.u64 q11, q2, #4 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q6, q6, q11 - vshl.u64 q11, q11, #4 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q5 - veor q11, q11, q4 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q4, q4, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - sub r5,r5,#1 - b Ldec_sbox -.align 4 -Ldec_loop: - vldmia r4!, {q8,q9,q10,q11} - veor q8, q8, q0 - veor q9, q9, q1 - vtbl.8 d0, {q8}, d24 - vtbl.8 d1, {q8}, d25 - vldmia r4!, {q8} - veor q10, q10, q2 - vtbl.8 d2, {q9}, d24 - vtbl.8 d3, {q9}, d25 - vldmia r4!, {q9} - veor q11, q11, q3 - vtbl.8 d4, {q10}, d24 - vtbl.8 d5, {q10}, d25 - vldmia r4!, {q10} - vtbl.8 d6, {q11}, d24 - vtbl.8 d7, {q11}, d25 - vldmia r4!, {q11} - veor q8, q8, q4 - veor q9, q9, q5 - vtbl.8 d8, {q8}, d24 - vtbl.8 d9, {q8}, d25 - veor q10, q10, q6 - vtbl.8 d10, {q9}, d24 - vtbl.8 d11, {q9}, d25 - veor q11, q11, q7 - vtbl.8 d12, {q10}, d24 - vtbl.8 d13, {q10}, d25 - vtbl.8 d14, {q11}, d24 - vtbl.8 d15, {q11}, d25 -Ldec_sbox: - veor q1, q1, q4 - veor q3, q3, q4 - - veor q4, q4, q7 - veor q1, q1, q6 - veor q2, q2, q7 - veor q6, q6, q4 - - veor q0, q0, q1 - veor q2, q2, q5 - veor q7, q7, q6 - veor q3, q3, q0 - veor q5, q5, q0 - veor q1, q1, q3 - veor q11, q3, q0 - veor q10, q7, q4 - veor q9, q1, q6 - veor q13, q4, q0 - vmov q8, q10 - veor q12, q5, q2 - - vorr q10, q10, q9 - veor q15, q11, q8 - vand q14, q11, q12 - vorr q11, q11, q12 - veor q12, q12, q9 - vand q8, q8, q9 - veor q9, q6, q2 - vand q15, q15, q12 - vand q13, q13, q9 - veor q9, q3, q7 - veor q12, q1, q5 - veor q11, q11, q13 - veor q10, q10, q13 - vand q13, q9, q12 - vorr q9, q9, q12 - veor q11, q11, q15 - veor q8, q8, q13 - veor q10, q10, q14 - veor q9, q9, q15 - veor q8, q8, q14 - vand q12, q4, q6 - veor q9, q9, q14 - vand q13, q0, q2 - vand q14, q7, q1 - vorr q15, q3, q5 - veor q11, q11, q12 - veor q9, q9, q14 - veor q8, q8, q15 - veor q10, q10, q13 - - @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 - - @ new smaller inversion - - vand q14, q11, q9 - vmov q12, q8 - - veor q13, q10, q14 - veor q15, q8, q14 - veor q14, q8, q14 @ q14=q15 - - vbsl q13, q9, q8 - vbsl q15, q11, q10 - veor q11, q11, q10 - - vbsl q12, q13, q14 - vbsl q8, q14, q13 - - vand q14, q12, q15 - veor q9, q9, q8 - - veor q14, q14, q11 - veor q12, q5, q2 - veor q8, q1, q6 - veor q10, q15, q14 - vand q10, q10, q5 - veor q5, q5, q1 - vand q11, q1, q15 - vand q5, q5, q14 - veor q1, q11, q10 - veor q5, q5, q11 - veor q15, q15, q13 - veor q14, q14, q9 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q2 - veor q12, q12, q8 - veor q2, q2, q6 - vand q8, q8, q15 - vand q6, q6, q13 - vand q12, q12, q14 - vand q2, q2, q9 - veor q8, q8, q12 - veor q2, q2, q6 - veor q12, q12, q11 - veor q6, q6, q10 - veor q5, q5, q12 - veor q2, q2, q12 - veor q1, q1, q8 - veor q6, q6, q8 - - veor q12, q3, q0 - veor q8, q7, q4 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q0 - veor q12, q12, q8 - veor q0, q0, q4 - vand q8, q8, q15 - vand q4, q4, q13 - vand q12, q12, q14 - vand q0, q0, q9 - veor q8, q8, q12 - veor q0, q0, q4 - veor q12, q12, q11 - veor q4, q4, q10 - veor q15, q15, q13 - veor q14, q14, q9 - veor q10, q15, q14 - vand q10, q10, q3 - veor q3, q3, q7 - vand q11, q7, q15 - vand q3, q3, q14 - veor q7, q11, q10 - veor q3, q3, q11 - veor 
q3, q3, q12 - veor q0, q0, q12 - veor q7, q7, q8 - veor q4, q4, q8 - veor q1, q1, q7 - veor q6, q6, q5 - - veor q4, q4, q1 - veor q2, q2, q7 - veor q5, q5, q7 - veor q4, q4, q2 - veor q7, q7, q0 - veor q4, q4, q5 - veor q3, q3, q6 - veor q6, q6, q1 - veor q3, q3, q4 - - veor q4, q4, q0 - veor q7, q7, q3 - subs r5,r5,#1 - bcc Ldec_done - @ multiplication by 0x05-0x00-0x04-0x00 - vext.8 q8, q0, q0, #8 - vext.8 q14, q3, q3, #8 - vext.8 q15, q5, q5, #8 - veor q8, q8, q0 - vext.8 q9, q1, q1, #8 - veor q14, q14, q3 - vext.8 q10, q6, q6, #8 - veor q15, q15, q5 - vext.8 q11, q4, q4, #8 - veor q9, q9, q1 - vext.8 q12, q2, q2, #8 - veor q10, q10, q6 - vext.8 q13, q7, q7, #8 - veor q11, q11, q4 - veor q12, q12, q2 - veor q13, q13, q7 - - veor q0, q0, q14 - veor q1, q1, q14 - veor q6, q6, q8 - veor q2, q2, q10 - veor q4, q4, q9 - veor q1, q1, q15 - veor q6, q6, q15 - veor q2, q2, q14 - veor q7, q7, q11 - veor q4, q4, q14 - veor q3, q3, q12 - veor q2, q2, q15 - veor q7, q7, q15 - veor q5, q5, q13 - vext.8 q8, q0, q0, #12 @ x0 <<< 32 - vext.8 q9, q1, q1, #12 - veor q0, q0, q8 @ x0 ^ (x0 <<< 32) - vext.8 q10, q6, q6, #12 - veor q1, q1, q9 - vext.8 q11, q4, q4, #12 - veor q6, q6, q10 - vext.8 q12, q2, q2, #12 - veor q4, q4, q11 - vext.8 q13, q7, q7, #12 - veor q2, q2, q12 - vext.8 q14, q3, q3, #12 - veor q7, q7, q13 - vext.8 q15, q5, q5, #12 - veor q3, q3, q14 - - veor q9, q9, q0 - veor q5, q5, q15 - vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) - veor q10, q10, q1 - veor q8, q8, q5 - veor q9, q9, q5 - vext.8 q1, q1, q1, #8 - veor q13, q13, q2 - veor q0, q0, q8 - veor q14, q14, q7 - veor q1, q1, q9 - vext.8 q8, q2, q2, #8 - veor q12, q12, q4 - vext.8 q9, q7, q7, #8 - veor q15, q15, q3 - vext.8 q2, q4, q4, #8 - veor q11, q11, q6 - vext.8 q7, q5, q5, #8 - veor q12, q12, q5 - vext.8 q4, q3, q3, #8 - veor q11, q11, q5 - vext.8 q3, q6, q6, #8 - veor q5, q9, q13 - veor q11, q11, q2 - veor q7, q7, q15 - veor q6, q4, q14 - veor q4, q8, q12 - veor q2, q3, q10 - vmov q3, q11 - @ vmov q5, q9 - vldmia r6, {q12} @ LISR - ite eq @ Thumb2 thing, sanity check in ARM - addeq r6,r6,#0x10 - bne Ldec_loop - vldmia r6, {q12} @ LISRM0 - b Ldec_loop -.align 4 -Ldec_done: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q3, #1 - vshr.u64 q11, q2, #1 - veor q10, q10, q5 - veor q11, q11, q7 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #1 - veor q7, q7, q11 - vshl.u64 q11, q11, #1 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q4 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q4, q4, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q7, #2 - vshr.u64 q11, q2, #2 - veor q10, q10, q5 - veor q11, q11, q3 - vand q10, q10, q9 - vand q11, q11, q9 - veor q5, q5, q10 - vshl.u64 q10, q10, #2 - veor q3, q3, q11 - vshl.u64 q11, q11, #2 - veor q7, q7, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q4 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q4, q4, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q4, #4 - vshr.u64 q11, q6, #4 - veor q10, q10, q5 - veor q11, q11, q3 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q3, q3, q11 - vshl.u64 q11, q11, #4 - veor q4, q4, q10 - veor q6, q6, q11 - vshr.u64 q10, q1, #4 - 
vshr.u64 q11, q0, #4 - veor q10, q10, q7 - veor q11, q11, q2 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q2, q2, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - vldmia r4, {q8} @ last round key - veor q6, q6, q8 - veor q4, q4, q8 - veor q2, q2, q8 - veor q7, q7, q8 - veor q3, q3, q8 - veor q5, q5, q8 - veor q0, q0, q8 - veor q1, q1, q8 - bx lr - - - -.align 6 -_bsaes_const: -LM0ISR:@ InvShiftRows constants -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -LISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -LISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -LM0SR:@ ShiftRows constants -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -LSR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -LSRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -LM0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -LREVM0SR: -.quad 0x090d01050c000408, 0x03070b0f060a0e02 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 6 - - -#ifdef __thumb2__ -.thumb_func _bsaes_encrypt8 -#endif -.align 4 -_bsaes_encrypt8: - adr r6,. - vldmia r4!, {q9} @ round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0SR -#else - sub r6,r6,#_bsaes_encrypt8-LM0SR -#endif - - vldmia r6!, {q8} @ LM0SR -_bsaes_encrypt8_alt: - veor q10, q0, q9 @ xor with round0 key - veor q11, q1, q9 - vtbl.8 d0, {q10}, d16 - vtbl.8 d1, {q10}, d17 - veor q12, q2, q9 - vtbl.8 d2, {q11}, d16 - vtbl.8 d3, {q11}, d17 - veor q13, q3, q9 - vtbl.8 d4, {q12}, d16 - vtbl.8 d5, {q12}, d17 - veor q14, q4, q9 - vtbl.8 d6, {q13}, d16 - vtbl.8 d7, {q13}, d17 - veor q15, q5, q9 - vtbl.8 d8, {q14}, d16 - vtbl.8 d9, {q14}, d17 - veor q10, q6, q9 - vtbl.8 d10, {q15}, d16 - vtbl.8 d11, {q15}, d17 - veor q11, q7, q9 - vtbl.8 d12, {q10}, d16 - vtbl.8 d13, {q10}, d17 - vtbl.8 d14, {q11}, d16 - vtbl.8 d15, {q11}, d17 -_bsaes_encrypt8_bitslice: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q4, #1 - veor q10, q10, q7 - veor q11, q11, q5 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #1 - veor q5, q5, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q3 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q3, q3, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q5, #2 - vshr.u64 q11, q4, #2 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q7, q7, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q5, q5, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q3 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q3, q3, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q3, #4 - vshr.u64 q11, q2, #4 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q6, q6, q11 - vshl.u64 q11, q11, #4 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q5 - veor q11, q11, q4 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 
- veor q4, q4, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - sub r5,r5,#1 - b Lenc_sbox -.align 4 -Lenc_loop: - vldmia r4!, {q8,q9,q10,q11} - veor q8, q8, q0 - veor q9, q9, q1 - vtbl.8 d0, {q8}, d24 - vtbl.8 d1, {q8}, d25 - vldmia r4!, {q8} - veor q10, q10, q2 - vtbl.8 d2, {q9}, d24 - vtbl.8 d3, {q9}, d25 - vldmia r4!, {q9} - veor q11, q11, q3 - vtbl.8 d4, {q10}, d24 - vtbl.8 d5, {q10}, d25 - vldmia r4!, {q10} - vtbl.8 d6, {q11}, d24 - vtbl.8 d7, {q11}, d25 - vldmia r4!, {q11} - veor q8, q8, q4 - veor q9, q9, q5 - vtbl.8 d8, {q8}, d24 - vtbl.8 d9, {q8}, d25 - veor q10, q10, q6 - vtbl.8 d10, {q9}, d24 - vtbl.8 d11, {q9}, d25 - veor q11, q11, q7 - vtbl.8 d12, {q10}, d24 - vtbl.8 d13, {q10}, d25 - vtbl.8 d14, {q11}, d24 - vtbl.8 d15, {q11}, d25 -Lenc_sbox: - veor q2, q2, q1 - veor q5, q5, q6 - veor q3, q3, q0 - veor q6, q6, q2 - veor q5, q5, q0 - - veor q6, q6, q3 - veor q3, q3, q7 - veor q7, q7, q5 - veor q3, q3, q4 - veor q4, q4, q5 - - veor q2, q2, q7 - veor q3, q3, q1 - veor q1, q1, q5 - veor q11, q7, q4 - veor q10, q1, q2 - veor q9, q5, q3 - veor q13, q2, q4 - vmov q8, q10 - veor q12, q6, q0 - - vorr q10, q10, q9 - veor q15, q11, q8 - vand q14, q11, q12 - vorr q11, q11, q12 - veor q12, q12, q9 - vand q8, q8, q9 - veor q9, q3, q0 - vand q15, q15, q12 - vand q13, q13, q9 - veor q9, q7, q1 - veor q12, q5, q6 - veor q11, q11, q13 - veor q10, q10, q13 - vand q13, q9, q12 - vorr q9, q9, q12 - veor q11, q11, q15 - veor q8, q8, q13 - veor q10, q10, q14 - veor q9, q9, q15 - veor q8, q8, q14 - vand q12, q2, q3 - veor q9, q9, q14 - vand q13, q4, q0 - vand q14, q1, q5 - vorr q15, q7, q6 - veor q11, q11, q12 - veor q9, q9, q14 - veor q8, q8, q15 - veor q10, q10, q13 - - @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 - - @ new smaller inversion - - vand q14, q11, q9 - vmov q12, q8 - - veor q13, q10, q14 - veor q15, q8, q14 - veor q14, q8, q14 @ q14=q15 - - vbsl q13, q9, q8 - vbsl q15, q11, q10 - veor q11, q11, q10 - - vbsl q12, q13, q14 - vbsl q8, q14, q13 - - vand q14, q12, q15 - veor q9, q9, q8 - - veor q14, q14, q11 - veor q12, q6, q0 - veor q8, q5, q3 - veor q10, q15, q14 - vand q10, q10, q6 - veor q6, q6, q5 - vand q11, q5, q15 - vand q6, q6, q14 - veor q5, q11, q10 - veor q6, q6, q11 - veor q15, q15, q13 - veor q14, q14, q9 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q0 - veor q12, q12, q8 - veor q0, q0, q3 - vand q8, q8, q15 - vand q3, q3, q13 - vand q12, q12, q14 - vand q0, q0, q9 - veor q8, q8, q12 - veor q0, q0, q3 - veor q12, q12, q11 - veor q3, q3, q10 - veor q6, q6, q12 - veor q0, q0, q12 - veor q5, q5, q8 - veor q3, q3, q8 - - veor q12, q7, q4 - veor q8, q1, q2 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q4 - veor q12, q12, q8 - veor q4, q4, q2 - vand q8, q8, q15 - vand q2, q2, q13 - vand q12, q12, q14 - vand q4, q4, q9 - veor q8, q8, q12 - veor q4, q4, q2 - veor q12, q12, q11 - veor q2, q2, q10 - veor q15, q15, q13 - veor q14, q14, q9 - veor q10, q15, q14 - vand q10, q10, q7 - veor q7, q7, q1 - vand q11, q1, q15 - vand q7, q7, q14 - veor q1, q11, q10 - veor q7, q7, q11 - veor q7, q7, q12 - veor q4, q4, q12 - veor q1, q1, q8 - veor q2, q2, q8 - veor q7, q7, q0 - veor q1, q1, q6 - veor q6, q6, q0 - veor q4, q4, q7 - veor q0, q0, q1 - - veor q1, q1, q5 - veor q5, q5, q2 - veor q2, q2, q3 - veor q3, q3, q5 - veor q4, q4, q5 - - veor q6, q6, q3 - subs r5,r5,#1 - bcc Lenc_done - vext.8 q8, q0, q0, #12 @ x0 <<< 32 - vext.8 q9, q1, q1, #12 - veor q0, q0, q8 @ x0 ^ (x0 <<< 32) - vext.8 q10, q4, q4, #12 - veor q1, 
q1, q9 - vext.8 q11, q6, q6, #12 - veor q4, q4, q10 - vext.8 q12, q3, q3, #12 - veor q6, q6, q11 - vext.8 q13, q7, q7, #12 - veor q3, q3, q12 - vext.8 q14, q2, q2, #12 - veor q7, q7, q13 - vext.8 q15, q5, q5, #12 - veor q2, q2, q14 - - veor q9, q9, q0 - veor q5, q5, q15 - vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) - veor q10, q10, q1 - veor q8, q8, q5 - veor q9, q9, q5 - vext.8 q1, q1, q1, #8 - veor q13, q13, q3 - veor q0, q0, q8 - veor q14, q14, q7 - veor q1, q1, q9 - vext.8 q8, q3, q3, #8 - veor q12, q12, q6 - vext.8 q9, q7, q7, #8 - veor q15, q15, q2 - vext.8 q3, q6, q6, #8 - veor q11, q11, q4 - vext.8 q7, q5, q5, #8 - veor q12, q12, q5 - vext.8 q6, q2, q2, #8 - veor q11, q11, q5 - vext.8 q2, q4, q4, #8 - veor q5, q9, q13 - veor q4, q8, q12 - veor q3, q3, q11 - veor q7, q7, q15 - veor q6, q6, q14 - @ vmov q4, q8 - veor q2, q2, q10 - @ vmov q5, q9 - vldmia r6, {q12} @ LSR - ite eq @ Thumb2 thing, samity check in ARM - addeq r6,r6,#0x10 - bne Lenc_loop - vldmia r6, {q12} @ LSRM0 - b Lenc_loop -.align 4 -Lenc_done: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q3, #1 - veor q10, q10, q5 - veor q11, q11, q7 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #1 - veor q7, q7, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q3, q3, q11 - vshr.u64 q10, q4, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q6 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q6, q6, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q4, q4, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q7, #2 - vshr.u64 q11, q3, #2 - veor q10, q10, q5 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q5, q5, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q7, q7, q10 - veor q3, q3, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q6 - veor q11, q11, q4 - vand q10, q10, q9 - vand q11, q11, q9 - veor q6, q6, q10 - vshl.u64 q10, q10, #2 - veor q4, q4, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q6, #4 - vshr.u64 q11, q4, #4 - veor q10, q10, q5 - veor q11, q11, q2 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q2, q2, q11 - vshl.u64 q11, q11, #4 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q7 - veor q11, q11, q3 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q3, q3, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - vldmia r4, {q8} @ last round key - veor q4, q4, q8 - veor q6, q6, q8 - veor q3, q3, q8 - veor q7, q7, q8 - veor q2, q2, q8 - veor q5, q5, q8 - veor q0, q0, q8 - veor q1, q1, q8 - bx lr - -#ifdef __thumb2__ -.thumb_func _bsaes_key_convert -#endif -.align 4 -_bsaes_key_convert: - adr r6,. - vld1.8 {q7}, [r4]! @ load round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0 -#else - sub r6,r6,#_bsaes_key_convert-LM0 -#endif - vld1.8 {q15}, [r4]! 
@ load round 1 key - - vmov.i8 q8, #0x01 @ bit masks - vmov.i8 q9, #0x02 - vmov.i8 q10, #0x04 - vmov.i8 q11, #0x08 - vmov.i8 q12, #0x10 - vmov.i8 q13, #0x20 - vldmia r6, {q14} @ LM0 - -#ifdef __ARMEL__ - vrev32.8 q7, q7 - vrev32.8 q15, q15 -#endif - sub r5,r5,#1 - vstmia r12!, {q7} @ save round 0 key - b Lkey_loop - -.align 4 -Lkey_loop: - vtbl.8 d14,{q15},d28 - vtbl.8 d15,{q15},d29 - vmov.i8 q6, #0x40 - vmov.i8 q15, #0x80 - - vtst.8 q0, q7, q8 - vtst.8 q1, q7, q9 - vtst.8 q2, q7, q10 - vtst.8 q3, q7, q11 - vtst.8 q4, q7, q12 - vtst.8 q5, q7, q13 - vtst.8 q6, q7, q6 - vtst.8 q7, q7, q15 - vld1.8 {q15}, [r4]! @ load next round key - vmvn q0, q0 @ "pnot" - vmvn q1, q1 - vmvn q5, q5 - vmvn q6, q6 -#ifdef __ARMEL__ - vrev32.8 q15, q15 -#endif - subs r5,r5,#1 - vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key - bne Lkey_loop - - vmov.i8 q7,#0x63 @ compose L63 - @ don't save last round key - bx lr - -.globl _bsaes_cbc_encrypt -.private_extern _bsaes_cbc_encrypt -#ifdef __thumb2__ -.thumb_func _bsaes_cbc_encrypt -#endif -.align 5 -_bsaes_cbc_encrypt: - @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for - @ short inputs. We patch this out, using bsaes for all input sizes. - - @ it is up to the caller to make sure we are called with enc == 0 - - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} - VFP_ABI_PUSH - ldr r8, [ip] @ IV is 1st arg on the stack - mov r2, r2, lsr#4 @ len in 16 byte blocks - sub sp, #0x10 @ scratch space to carry over the IV - mov r9, sp @ save sp - - ldr r10, [r3, #240] @ get # of rounds -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key - add r12, #96 @ sifze of bit-slices key schedule - - @ populate the key schedule - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - mov sp, r12 @ sp is sp - bl _bsaes_key_convert - vldmia sp, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia sp, {q7} -#else - ldr r12, [r3, #244] - eors r12, #1 - beq 0f - - @ populate the key schedule - str r12, [r3, #244] - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - add r12, r3, #248 @ pass key schedule - bl _bsaes_key_convert - add r4, r3, #248 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} - -.align 2 - -#endif - - vld1.8 {q15}, [r8] @ load IV - b Lcbc_dec_loop - -.align 4 -Lcbc_dec_loop: - subs r2, r2, #0x8 - bmi Lcbc_dec_loop_finish - - vld1.8 {q0,q1}, [r0]! @ load input - vld1.8 {q2,q3}, [r0]! -#ifndef BSAES_ASM_EXTENDED_KEY - mov r4, sp @ pass the key -#else - add r4, r3, #248 -#endif - vld1.8 {q4,q5}, [r0]! - mov r5, r10 - vld1.8 {q6,q7}, [r0] - sub r0, r0, #0x60 - vstmia r9, {q15} @ put aside IV - - bl _bsaes_decrypt8 - - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q14,q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q3, q3, q13 - vst1.8 {q6}, [r1]! - veor q5, q5, q14 - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - vst1.8 {q3}, [r1]! - vst1.8 {q5}, [r1]! - - b Lcbc_dec_loop - -Lcbc_dec_loop_finish: - adds r2, r2, #8 - beq Lcbc_dec_done - - @ Set up most parameters for the _bsaes_decrypt8 call. 
-#ifndef BSAES_ASM_EXTENDED_KEY - mov r4, sp @ pass the key -#else - add r4, r3, #248 -#endif - mov r5, r10 - vstmia r9, {q15} @ put aside IV - - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo Lcbc_dec_one - vld1.8 {q1}, [r0]! - beq Lcbc_dec_two - vld1.8 {q2}, [r0]! - cmp r2, #4 - blo Lcbc_dec_three - vld1.8 {q3}, [r0]! - beq Lcbc_dec_four - vld1.8 {q4}, [r0]! - cmp r2, #6 - blo Lcbc_dec_five - vld1.8 {q5}, [r0]! - beq Lcbc_dec_six - vld1.8 {q6}, [r0]! - sub r0, r0, #0x70 - - bl _bsaes_decrypt8 - - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q3, q3, q13 - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - vst1.8 {q3}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_six: - sub r0, r0, #0x60 - bl _bsaes_decrypt8 - vldmia r9,{q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_five: - sub r0, r0, #0x50 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q15}, [r0]! - veor q4, q4, q10 - vst1.8 {q0,q1}, [r1]! @ write output - veor q2, q2, q11 - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_four: - sub r0, r0, #0x40 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q15}, [r0]! - veor q4, q4, q10 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_three: - sub r0, r0, #0x30 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q15}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_two: - sub r0, r0, #0x20 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q15}, [r0]! @ reload input - veor q1, q1, q8 - vst1.8 {q0,q1}, [r1]! @ write output - b Lcbc_dec_done -.align 4 -Lcbc_dec_one: - sub r0, r0, #0x10 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q15}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vst1.8 {q0}, [r1]! 
@ write output - -Lcbc_dec_done: -#ifndef BSAES_ASM_EXTENDED_KEY - vmov.i32 q0, #0 - vmov.i32 q1, #0 -Lcbc_dec_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r9 - bne Lcbc_dec_bzero -#endif - - mov sp, r9 - add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb - vst1.8 {q15}, [r8] @ return IV - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} - -.globl _bsaes_ctr32_encrypt_blocks -.private_extern _bsaes_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _bsaes_ctr32_encrypt_blocks -#endif -.align 5 -_bsaes_ctr32_encrypt_blocks: - @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this - @ out to retain a constant-time implementation. - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} - VFP_ABI_PUSH - ldr r8, [ip] @ ctr is 1st arg on the stack - sub sp, sp, #0x10 @ scratch space to carry over the ctr - mov r9, sp @ save sp - - ldr r10, [r3, #240] @ get # of rounds -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key - add r12, #96 @ size of bit-sliced key schedule - - @ populate the key schedule - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - mov sp, r12 @ sp is sp - bl _bsaes_key_convert - veor q7,q7,q15 @ fix up last round key - vstmia r12, {q7} @ save last round key - - vld1.8 {q0}, [r8] @ load counter -#ifdef __APPLE__ - mov r8, #:lower16:(LREVM0SR-LM0) - add r8, r6, r8 -#else - add r8, r6, #LREVM0SR-LM0 @ borrow r8 -#endif - vldmia sp, {q4} @ load round0 key -#else - ldr r12, [r3, #244] - eors r12, #1 - beq 0f - - @ populate the key schedule - str r12, [r3, #244] - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - add r12, r3, #248 @ pass key schedule - bl _bsaes_key_convert - veor q7,q7,q15 @ fix up last round key - vstmia r12, {q7} @ save last round key - -.align 2 - add r12, r3, #248 - vld1.8 {q0}, [r8] @ load counter - adrl r8, LREVM0SR @ borrow r8 - vldmia r12, {q4} @ load round0 key - sub sp, #0x10 @ place for adjusted round0 key -#endif - - vmov.i32 q8,#1 @ compose 1<<96 - veor q9,q9,q9 - vrev32.8 q0,q0 - vext.8 q8,q9,q8,#4 - vrev32.8 q4,q4 - vadd.u32 q9,q8,q8 @ compose 2<<96 - vstmia sp, {q4} @ save adjusted round0 key - b Lctr_enc_loop - -.align 4 -Lctr_enc_loop: - vadd.u32 q10, q8, q9 @ compose 3<<96 - vadd.u32 q1, q0, q8 @ +1 - vadd.u32 q2, q0, q9 @ +2 - vadd.u32 q3, q0, q10 @ +3 - vadd.u32 q4, q1, q10 - vadd.u32 q5, q2, q10 - vadd.u32 q6, q3, q10 - vadd.u32 q7, q4, q10 - vadd.u32 q10, q5, q10 @ next counter - - @ Borrow prologue from _bsaes_encrypt8 to use the opportunity - @ to flip byte order in 32-bit counter - - vldmia sp, {q9} @ load round0 key -#ifndef BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x10 @ pass next round key -#else - add r4, r3, #264 -#endif - vldmia r8, {q8} @ LREVM0SR - mov r5, r10 @ pass rounds - vstmia r9, {q10} @ save next counter -#ifdef __APPLE__ - mov r6, #:lower16:(LREVM0SR-LSR) - sub r6, r8, r6 -#else - sub r6, r8, #LREVM0SR-LSR @ pass constants -#endif - - bl _bsaes_encrypt8_alt - - subs r2, r2, #8 - blo Lctr_enc_loop_done - - vld1.8 {q8,q9}, [r0]! @ load input - vld1.8 {q10,q11}, [r0]! - veor q0, q8 - veor q1, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q10 - veor q6, q11 - vld1.8 {q14,q15}, [r0]! - veor q3, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q7, q13 - veor q2, q14 - vst1.8 {q4}, [r1]! - veor q5, q15 - vst1.8 {q6}, [r1]! - vmov.i32 q8, #1 @ compose 1<<96 - vst1.8 {q3}, [r1]! - veor q9, q9, q9 - vst1.8 {q7}, [r1]! - vext.8 q8, q9, q8, #4 - vst1.8 {q2}, [r1]! 
- vadd.u32 q9,q8,q8 @ compose 2<<96 - vst1.8 {q5}, [r1]! - vldmia r9, {q0} @ load counter - - bne Lctr_enc_loop - b Lctr_enc_done - -.align 4 -Lctr_enc_loop_done: - add r2, r2, #8 - vld1.8 {q8}, [r0]! @ load input - veor q0, q8 - vst1.8 {q0}, [r1]! @ write output - cmp r2, #2 - blo Lctr_enc_done - vld1.8 {q9}, [r0]! - veor q1, q9 - vst1.8 {q1}, [r1]! - beq Lctr_enc_done - vld1.8 {q10}, [r0]! - veor q4, q10 - vst1.8 {q4}, [r1]! - cmp r2, #4 - blo Lctr_enc_done - vld1.8 {q11}, [r0]! - veor q6, q11 - vst1.8 {q6}, [r1]! - beq Lctr_enc_done - vld1.8 {q12}, [r0]! - veor q3, q12 - vst1.8 {q3}, [r1]! - cmp r2, #6 - blo Lctr_enc_done - vld1.8 {q13}, [r0]! - veor q7, q13 - vst1.8 {q7}, [r1]! - beq Lctr_enc_done - vld1.8 {q14}, [r0] - veor q2, q14 - vst1.8 {q2}, [r1]! - -Lctr_enc_done: - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifndef BSAES_ASM_EXTENDED_KEY -Lctr_enc_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r9 - bne Lctr_enc_bzero -#else - vstmia sp, {q0,q1} -#endif - - mov sp, r9 - add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - - @ OpenSSL contains aes_nohw_* fallback code here. We patch this - @ out to retain a constant-time implementation. - -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/aead.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/aead.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/aead.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/aead.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/cipher.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/cipher.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/cipher.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/cipher.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c.inc similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c.inc index 12387394..57f8d16e 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aes.c.inc @@ -56,11 +56,11 @@ #include #include #include -#include #include "internal.h" #include "../../internal.h" #include "../aes/internal.h" +#include "../bcm_interface.h" #include "../modes/internal.h" #include "../service_indicator/internal.h" #include "../delocate.h" @@ -471,11 +471,11 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) { } OPENSSL_memcpy(gctx->iv, ptr, arg); if (c->encrypt) { - // |RAND_bytes| calls within the fipsmodule should be wrapped with state - // lock functions to avoid updating the service indicator with the DRBG - // functions. + // |BCM_rand_bytes| calls within the fipsmodule should be wrapped with + // state lock functions to avoid updating the service indicator with the + // DRBG functions. 
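The RAND_bytes-to-BCM_rand_bytes substitutions in this file keep the same bracketing discipline: DRBG output drawn for internal bookkeeping (such as an auto-generated GCM nonce) is wrapped in service-indicator lock/unlock calls so it does not register as an approved-service use. A small sketch of that wrapping pattern is shown below, with illustrative stand-in names rather than the real BoringSSL internals.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the lock/unlock bracketing: randomness drawn for internal use must
   not update the FIPS approved-service indicator, so the DRBG call sits between
   lock and unlock. Every name below is an illustrative stand-in. */
static void service_indicator_lock(void) { /* suspend indicator updates */ }
static void service_indicator_unlock(void) { /* resume indicator updates */ }

static void drbg_generate(uint8_t *out, size_t len) {
  for (size_t i = 0; i < len; i++) {
    out[i] = 0; /* placeholder bytes; a real DRBG call goes here */
  }
}

static void fill_internal_nonce(uint8_t *nonce, size_t len) {
  service_indicator_lock();
  drbg_generate(nonce, len);
  service_indicator_unlock();
}

int main(void) {
  uint8_t nonce[12];
  fill_internal_nonce(nonce, sizeof(nonce));
  return nonce[0];
}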
FIPS_service_indicator_lock_state(); - RAND_bytes(gctx->iv + arg, gctx->ivlen - arg); + BCM_rand_bytes(gctx->iv + arg, gctx->ivlen - arg); FIPS_service_indicator_unlock_state(); } gctx->iv_gen = 1; @@ -1167,10 +1167,11 @@ static int aead_aes_gcm_seal_scatter_randnonce( return 0; } - // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock - // functions to avoid updating the service indicator with the DRBG functions. + // |BCM_rand_bytes| calls within the fipsmodule should be wrapped with state + // lock functions to avoid updating the service indicator with the DRBG + // functions. FIPS_service_indicator_lock_state(); - RAND_bytes(nonce, sizeof(nonce)); + BCM_rand_bytes(nonce, sizeof(nonce)); FIPS_service_indicator_unlock_state(); const struct aead_aes_gcm_ctx *gcm_ctx = diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aesccm.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aesccm.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aesccm.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/e_aesccm.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/internal.h index fe960b63..3f00614c 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cipher/internal.h @@ -134,9 +134,6 @@ struct evp_cipher_st { // flags contains the OR of a number of flags. See |EVP_CIPH_*|. uint32_t flags; - // app_data is a pointer to opaque, user data. - void *app_data; - int (*init)(EVP_CIPHER_CTX *ctx, const uint8_t *key, const uint8_t *iv, int enc); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/cmac/cmac.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/cmac/cmac.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/cmac/cmac.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/cmac/cmac.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-windows.windows.x86.S deleted file mode 100644 index 763579fd..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-windows.windows.x86.S +++ /dev/null @@ -1,1270 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
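The Windows x86 file removed below is generated Comba multiplication: bn_mul_comba8 and bn_mul_comba4 compute each result word as the column sum of all a[i]*b[j] with i + j fixed, carried through a three-word accumulator, which is exactly the "Calculate word 0", "Calculate word 1", ... sequence annotated in the assembly. A plain C sketch of the 4-word case with 32-bit words follows; it is illustrative only, not the BoringSSL implementation.

#include <stdint.h>
#include <stdio.h>

/* Comba (column-wise schoolbook) multiplication of two 4-word numbers. Each
   output word r[k] is the sum of all a[i]*b[j] with i + j == k, accumulated in
   a three-word (c0, c1, c2) accumulator. */
static void mul_comba4(uint32_t r[8], const uint32_t a[4], const uint32_t b[4]) {
  uint32_t c0 = 0, c1 = 0, c2 = 0;
  for (int k = 0; k < 7; k++) {
    for (int i = 0; i < 4; i++) {
      int j = k - i;
      if (j < 0 || j > 3) {
        continue;
      }
      uint64_t p = (uint64_t)a[i] * b[j];
      uint64_t t = (uint64_t)c0 + (uint32_t)p;                        /* low half      */
      c0 = (uint32_t)t;
      t = (uint64_t)c1 + (uint32_t)(p >> 32) + (uint32_t)(t >> 32);   /* high + carry  */
      c1 = (uint32_t)t;
      c2 += (uint32_t)(t >> 32);                                      /* overflow word */
    }
    r[k] = c0; /* column k is complete */
    c0 = c1;
    c1 = c2;
    c2 = 0;
  }
  r[7] = c0;
}

int main(void) {
  const uint32_t a[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
  const uint32_t b[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
  uint32_t r[8];
  mul_comba4(r, a, b);
  for (int k = 7; k >= 0; k--) {
    printf("%08x", r[k]); /* prints (2^128 - 1)^2 */
  }
  printf("\n");
  return 0;
}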
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _bn_mul_comba8 -align 16 -_bn_mul_comba8: -L$_bn_mul_comba8_begin: - push esi - mov esi,DWORD [12+esp] - push edi - mov edi,DWORD [20+esp] - push ebp - push ebx - xor ebx,ebx - mov eax,DWORD [esi] - xor ecx,ecx - mov edx,DWORD [edi] - ; ################## Calculate word 0 - xor ebp,ebp - ; mul a[0]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [edi] - adc ebp,0 - mov DWORD [eax],ebx - mov eax,DWORD [4+esi] - ; saved r[0] - ; ################## Calculate word 1 - xor ebx,ebx - ; mul a[1]*b[0] - mul edx - add ecx,eax - mov eax,DWORD [esi] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - ; mul a[0]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [edi] - adc ebx,0 - mov DWORD [4+eax],ecx - mov eax,DWORD [8+esi] - ; saved r[1] - ; ################## Calculate word 2 - xor ecx,ecx - ; mul a[2]*b[0] - mul edx - add ebp,eax - mov eax,DWORD [4+esi] - adc ebx,edx - mov edx,DWORD [4+edi] - adc ecx,0 - ; mul a[1]*b[1] - mul edx - add ebp,eax - mov eax,DWORD [esi] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - ; mul a[0]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [edi] - adc ecx,0 - mov DWORD [8+eax],ebp - mov eax,DWORD [12+esi] - ; saved r[2] - ; ################## Calculate word 3 - xor ebp,ebp - ; mul a[3]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [8+esi] - adc ecx,edx - mov edx,DWORD [4+edi] - adc ebp,0 - ; mul a[2]*b[1] - mul edx - add ebx,eax - mov eax,DWORD [4+esi] - adc ecx,edx - mov edx,DWORD [8+edi] - adc ebp,0 - ; mul a[1]*b[2] - mul edx - add ebx,eax - mov eax,DWORD [esi] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - ; mul a[0]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [edi] - adc ebp,0 - mov DWORD [12+eax],ebx - mov eax,DWORD [16+esi] - ; saved r[3] - ; ################## Calculate word 4 - xor ebx,ebx - ; mul a[4]*b[0] - mul edx - add ecx,eax - mov eax,DWORD [12+esi] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - ; mul a[3]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [8+esi] - adc ebp,edx - mov edx,DWORD [8+edi] - adc ebx,0 - ; mul a[2]*b[2] - mul edx - add ecx,eax - mov eax,DWORD [4+esi] - adc ebp,edx - mov edx,DWORD [12+edi] - adc ebx,0 - ; mul a[1]*b[3] - mul edx - add ecx,eax - mov eax,DWORD [esi] - adc ebp,edx - mov edx,DWORD [16+edi] - adc ebx,0 - ; mul a[0]*b[4] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [edi] - adc ebx,0 - mov DWORD [16+eax],ecx - mov eax,DWORD [20+esi] - ; saved r[4] - ; ################## Calculate word 5 - xor ecx,ecx - ; mul a[5]*b[0] - mul edx - add ebp,eax - mov eax,DWORD [16+esi] - adc ebx,edx - mov edx,DWORD [4+edi] - adc ecx,0 - ; mul a[4]*b[1] - mul edx - add ebp,eax - mov eax,DWORD [12+esi] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - ; mul a[3]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [8+esi] - adc ebx,edx - mov edx,DWORD [12+edi] - adc ecx,0 - ; mul a[2]*b[3] - mul edx - add ebp,eax - mov eax,DWORD [4+esi] - adc ebx,edx - mov edx,DWORD [16+edi] - adc ecx,0 - ; mul a[1]*b[4] - mul edx - add ebp,eax - mov eax,DWORD [esi] - adc ebx,edx - mov edx,DWORD [20+edi] - adc ecx,0 - ; mul a[0]*b[5] - mul edx - add ebp,eax - mov eax,DWORD 
[20+esp] - adc ebx,edx - mov edx,DWORD [edi] - adc ecx,0 - mov DWORD [20+eax],ebp - mov eax,DWORD [24+esi] - ; saved r[5] - ; ################## Calculate word 6 - xor ebp,ebp - ; mul a[6]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [20+esi] - adc ecx,edx - mov edx,DWORD [4+edi] - adc ebp,0 - ; mul a[5]*b[1] - mul edx - add ebx,eax - mov eax,DWORD [16+esi] - adc ecx,edx - mov edx,DWORD [8+edi] - adc ebp,0 - ; mul a[4]*b[2] - mul edx - add ebx,eax - mov eax,DWORD [12+esi] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - ; mul a[3]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [8+esi] - adc ecx,edx - mov edx,DWORD [16+edi] - adc ebp,0 - ; mul a[2]*b[4] - mul edx - add ebx,eax - mov eax,DWORD [4+esi] - adc ecx,edx - mov edx,DWORD [20+edi] - adc ebp,0 - ; mul a[1]*b[5] - mul edx - add ebx,eax - mov eax,DWORD [esi] - adc ecx,edx - mov edx,DWORD [24+edi] - adc ebp,0 - ; mul a[0]*b[6] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [edi] - adc ebp,0 - mov DWORD [24+eax],ebx - mov eax,DWORD [28+esi] - ; saved r[6] - ; ################## Calculate word 7 - xor ebx,ebx - ; mul a[7]*b[0] - mul edx - add ecx,eax - mov eax,DWORD [24+esi] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - ; mul a[6]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [20+esi] - adc ebp,edx - mov edx,DWORD [8+edi] - adc ebx,0 - ; mul a[5]*b[2] - mul edx - add ecx,eax - mov eax,DWORD [16+esi] - adc ebp,edx - mov edx,DWORD [12+edi] - adc ebx,0 - ; mul a[4]*b[3] - mul edx - add ecx,eax - mov eax,DWORD [12+esi] - adc ebp,edx - mov edx,DWORD [16+edi] - adc ebx,0 - ; mul a[3]*b[4] - mul edx - add ecx,eax - mov eax,DWORD [8+esi] - adc ebp,edx - mov edx,DWORD [20+edi] - adc ebx,0 - ; mul a[2]*b[5] - mul edx - add ecx,eax - mov eax,DWORD [4+esi] - adc ebp,edx - mov edx,DWORD [24+edi] - adc ebx,0 - ; mul a[1]*b[6] - mul edx - add ecx,eax - mov eax,DWORD [esi] - adc ebp,edx - mov edx,DWORD [28+edi] - adc ebx,0 - ; mul a[0]*b[7] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - mov DWORD [28+eax],ecx - mov eax,DWORD [28+esi] - ; saved r[7] - ; ################## Calculate word 8 - xor ecx,ecx - ; mul a[7]*b[1] - mul edx - add ebp,eax - mov eax,DWORD [24+esi] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - ; mul a[6]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [20+esi] - adc ebx,edx - mov edx,DWORD [12+edi] - adc ecx,0 - ; mul a[5]*b[3] - mul edx - add ebp,eax - mov eax,DWORD [16+esi] - adc ebx,edx - mov edx,DWORD [16+edi] - adc ecx,0 - ; mul a[4]*b[4] - mul edx - add ebp,eax - mov eax,DWORD [12+esi] - adc ebx,edx - mov edx,DWORD [20+edi] - adc ecx,0 - ; mul a[3]*b[5] - mul edx - add ebp,eax - mov eax,DWORD [8+esi] - adc ebx,edx - mov edx,DWORD [24+edi] - adc ecx,0 - ; mul a[2]*b[6] - mul edx - add ebp,eax - mov eax,DWORD [4+esi] - adc ebx,edx - mov edx,DWORD [28+edi] - adc ecx,0 - ; mul a[1]*b[7] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - mov DWORD [32+eax],ebp - mov eax,DWORD [28+esi] - ; saved r[8] - ; ################## Calculate word 9 - xor ebp,ebp - ; mul a[7]*b[2] - mul edx - add ebx,eax - mov eax,DWORD [24+esi] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - ; mul a[6]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [20+esi] - adc ecx,edx - mov edx,DWORD [16+edi] - adc ebp,0 - ; mul a[5]*b[4] - mul edx - add ebx,eax - mov eax,DWORD [16+esi] - adc ecx,edx - mov edx,DWORD [20+edi] - adc ebp,0 - ; mul a[4]*b[5] - mul edx - add ebx,eax - mov eax,DWORD [12+esi] - adc ecx,edx - mov 
edx,DWORD [24+edi] - adc ebp,0 - ; mul a[3]*b[6] - mul edx - add ebx,eax - mov eax,DWORD [8+esi] - adc ecx,edx - mov edx,DWORD [28+edi] - adc ebp,0 - ; mul a[2]*b[7] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - mov DWORD [36+eax],ebx - mov eax,DWORD [28+esi] - ; saved r[9] - ; ################## Calculate word 10 - xor ebx,ebx - ; mul a[7]*b[3] - mul edx - add ecx,eax - mov eax,DWORD [24+esi] - adc ebp,edx - mov edx,DWORD [16+edi] - adc ebx,0 - ; mul a[6]*b[4] - mul edx - add ecx,eax - mov eax,DWORD [20+esi] - adc ebp,edx - mov edx,DWORD [20+edi] - adc ebx,0 - ; mul a[5]*b[5] - mul edx - add ecx,eax - mov eax,DWORD [16+esi] - adc ebp,edx - mov edx,DWORD [24+edi] - adc ebx,0 - ; mul a[4]*b[6] - mul edx - add ecx,eax - mov eax,DWORD [12+esi] - adc ebp,edx - mov edx,DWORD [28+edi] - adc ebx,0 - ; mul a[3]*b[7] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [16+edi] - adc ebx,0 - mov DWORD [40+eax],ecx - mov eax,DWORD [28+esi] - ; saved r[10] - ; ################## Calculate word 11 - xor ecx,ecx - ; mul a[7]*b[4] - mul edx - add ebp,eax - mov eax,DWORD [24+esi] - adc ebx,edx - mov edx,DWORD [20+edi] - adc ecx,0 - ; mul a[6]*b[5] - mul edx - add ebp,eax - mov eax,DWORD [20+esi] - adc ebx,edx - mov edx,DWORD [24+edi] - adc ecx,0 - ; mul a[5]*b[6] - mul edx - add ebp,eax - mov eax,DWORD [16+esi] - adc ebx,edx - mov edx,DWORD [28+edi] - adc ecx,0 - ; mul a[4]*b[7] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [20+edi] - adc ecx,0 - mov DWORD [44+eax],ebp - mov eax,DWORD [28+esi] - ; saved r[11] - ; ################## Calculate word 12 - xor ebp,ebp - ; mul a[7]*b[5] - mul edx - add ebx,eax - mov eax,DWORD [24+esi] - adc ecx,edx - mov edx,DWORD [24+edi] - adc ebp,0 - ; mul a[6]*b[6] - mul edx - add ebx,eax - mov eax,DWORD [20+esi] - adc ecx,edx - mov edx,DWORD [28+edi] - adc ebp,0 - ; mul a[5]*b[7] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [24+edi] - adc ebp,0 - mov DWORD [48+eax],ebx - mov eax,DWORD [28+esi] - ; saved r[12] - ; ################## Calculate word 13 - xor ebx,ebx - ; mul a[7]*b[6] - mul edx - add ecx,eax - mov eax,DWORD [24+esi] - adc ebp,edx - mov edx,DWORD [28+edi] - adc ebx,0 - ; mul a[6]*b[7] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [28+edi] - adc ebx,0 - mov DWORD [52+eax],ecx - mov eax,DWORD [28+esi] - ; saved r[13] - ; ################## Calculate word 14 - xor ecx,ecx - ; mul a[7]*b[7] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - adc ecx,0 - mov DWORD [56+eax],ebp - ; saved r[14] - ; save r[15] - mov DWORD [60+eax],ebx - pop ebx - pop ebp - pop edi - pop esi - ret -global _bn_mul_comba4 -align 16 -_bn_mul_comba4: -L$_bn_mul_comba4_begin: - push esi - mov esi,DWORD [12+esp] - push edi - mov edi,DWORD [20+esp] - push ebp - push ebx - xor ebx,ebx - mov eax,DWORD [esi] - xor ecx,ecx - mov edx,DWORD [edi] - ; ################## Calculate word 0 - xor ebp,ebp - ; mul a[0]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [edi] - adc ebp,0 - mov DWORD [eax],ebx - mov eax,DWORD [4+esi] - ; saved r[0] - ; ################## Calculate word 1 - xor ebx,ebx - ; mul a[1]*b[0] - mul edx - add ecx,eax - mov eax,DWORD [esi] - adc ebp,edx - mov edx,DWORD [4+edi] - adc ebx,0 - ; mul a[0]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [edi] - adc ebx,0 - mov DWORD [4+eax],ecx - mov eax,DWORD [8+esi] - ; saved 
r[1] - ; ################## Calculate word 2 - xor ecx,ecx - ; mul a[2]*b[0] - mul edx - add ebp,eax - mov eax,DWORD [4+esi] - adc ebx,edx - mov edx,DWORD [4+edi] - adc ecx,0 - ; mul a[1]*b[1] - mul edx - add ebp,eax - mov eax,DWORD [esi] - adc ebx,edx - mov edx,DWORD [8+edi] - adc ecx,0 - ; mul a[0]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [edi] - adc ecx,0 - mov DWORD [8+eax],ebp - mov eax,DWORD [12+esi] - ; saved r[2] - ; ################## Calculate word 3 - xor ebp,ebp - ; mul a[3]*b[0] - mul edx - add ebx,eax - mov eax,DWORD [8+esi] - adc ecx,edx - mov edx,DWORD [4+edi] - adc ebp,0 - ; mul a[2]*b[1] - mul edx - add ebx,eax - mov eax,DWORD [4+esi] - adc ecx,edx - mov edx,DWORD [8+edi] - adc ebp,0 - ; mul a[1]*b[2] - mul edx - add ebx,eax - mov eax,DWORD [esi] - adc ecx,edx - mov edx,DWORD [12+edi] - adc ebp,0 - ; mul a[0]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - mov edx,DWORD [4+edi] - adc ebp,0 - mov DWORD [12+eax],ebx - mov eax,DWORD [12+esi] - ; saved r[3] - ; ################## Calculate word 4 - xor ebx,ebx - ; mul a[3]*b[1] - mul edx - add ecx,eax - mov eax,DWORD [8+esi] - adc ebp,edx - mov edx,DWORD [8+edi] - adc ebx,0 - ; mul a[2]*b[2] - mul edx - add ecx,eax - mov eax,DWORD [4+esi] - adc ebp,edx - mov edx,DWORD [12+edi] - adc ebx,0 - ; mul a[1]*b[3] - mul edx - add ecx,eax - mov eax,DWORD [20+esp] - adc ebp,edx - mov edx,DWORD [8+edi] - adc ebx,0 - mov DWORD [16+eax],ecx - mov eax,DWORD [12+esi] - ; saved r[4] - ; ################## Calculate word 5 - xor ecx,ecx - ; mul a[3]*b[2] - mul edx - add ebp,eax - mov eax,DWORD [8+esi] - adc ebx,edx - mov edx,DWORD [12+edi] - adc ecx,0 - ; mul a[2]*b[3] - mul edx - add ebp,eax - mov eax,DWORD [20+esp] - adc ebx,edx - mov edx,DWORD [12+edi] - adc ecx,0 - mov DWORD [20+eax],ebp - mov eax,DWORD [12+esi] - ; saved r[5] - ; ################## Calculate word 6 - xor ebp,ebp - ; mul a[3]*b[3] - mul edx - add ebx,eax - mov eax,DWORD [20+esp] - adc ecx,edx - adc ebp,0 - mov DWORD [24+eax],ebx - ; saved r[6] - ; save r[7] - mov DWORD [28+eax],ecx - pop ebx - pop ebp - pop edi - pop esi - ret -global _bn_sqr_comba8 -align 16 -_bn_sqr_comba8: -L$_bn_sqr_comba8_begin: - push esi - push edi - push ebp - push ebx - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - xor ebx,ebx - xor ecx,ecx - mov eax,DWORD [esi] - ; ############### Calculate word 0 - xor ebp,ebp - ; sqr a[0]*a[0] - mul eax - add ebx,eax - adc ecx,edx - mov edx,DWORD [esi] - adc ebp,0 - mov DWORD [edi],ebx - mov eax,DWORD [4+esi] - ; saved r[0] - ; ############### Calculate word 1 - xor ebx,ebx - ; sqr a[1]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [8+esi] - adc ebx,0 - mov DWORD [4+edi],ecx - mov edx,DWORD [esi] - ; saved r[1] - ; ############### Calculate word 2 - xor ecx,ecx - ; sqr a[2]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [4+esi] - adc ecx,0 - ; sqr a[1]*a[1] - mul eax - add ebp,eax - adc ebx,edx - mov edx,DWORD [esi] - adc ecx,0 - mov DWORD [8+edi],ebp - mov eax,DWORD [12+esi] - ; saved r[2] - ; ############### Calculate word 3 - xor ebp,ebp - ; sqr a[3]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [8+esi] - adc ebp,0 - mov edx,DWORD [4+esi] - ; sqr a[2]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [16+esi] - adc ebp,0 - mov DWORD [12+edi],ebx - mov edx,DWORD [esi] - ; saved r[3] - ; 
############### Calculate word 4 - xor ebx,ebx - ; sqr a[4]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [12+esi] - adc ebx,0 - mov edx,DWORD [4+esi] - ; sqr a[3]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [8+esi] - adc ebx,0 - ; sqr a[2]*a[2] - mul eax - add ecx,eax - adc ebp,edx - mov edx,DWORD [esi] - adc ebx,0 - mov DWORD [16+edi],ecx - mov eax,DWORD [20+esi] - ; saved r[4] - ; ############### Calculate word 5 - xor ecx,ecx - ; sqr a[5]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [16+esi] - adc ecx,0 - mov edx,DWORD [4+esi] - ; sqr a[4]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [12+esi] - adc ecx,0 - mov edx,DWORD [8+esi] - ; sqr a[3]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [24+esi] - adc ecx,0 - mov DWORD [20+edi],ebp - mov edx,DWORD [esi] - ; saved r[5] - ; ############### Calculate word 6 - xor ebp,ebp - ; sqr a[6]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [20+esi] - adc ebp,0 - mov edx,DWORD [4+esi] - ; sqr a[5]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [16+esi] - adc ebp,0 - mov edx,DWORD [8+esi] - ; sqr a[4]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [12+esi] - adc ebp,0 - ; sqr a[3]*a[3] - mul eax - add ebx,eax - adc ecx,edx - mov edx,DWORD [esi] - adc ebp,0 - mov DWORD [24+edi],ebx - mov eax,DWORD [28+esi] - ; saved r[6] - ; ############### Calculate word 7 - xor ebx,ebx - ; sqr a[7]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [24+esi] - adc ebx,0 - mov edx,DWORD [4+esi] - ; sqr a[6]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [20+esi] - adc ebx,0 - mov edx,DWORD [8+esi] - ; sqr a[5]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [16+esi] - adc ebx,0 - mov edx,DWORD [12+esi] - ; sqr a[4]*a[3] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [28+esi] - adc ebx,0 - mov DWORD [28+edi],ecx - mov edx,DWORD [4+esi] - ; saved r[7] - ; ############### Calculate word 8 - xor ecx,ecx - ; sqr a[7]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [24+esi] - adc ecx,0 - mov edx,DWORD [8+esi] - ; sqr a[6]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [20+esi] - adc ecx,0 - mov edx,DWORD [12+esi] - ; sqr a[5]*a[3] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [16+esi] - adc ecx,0 - ; sqr a[4]*a[4] - mul eax - add ebp,eax - adc ebx,edx - mov edx,DWORD [8+esi] - adc ecx,0 - mov DWORD [32+edi],ebp - mov eax,DWORD [28+esi] - ; saved r[8] - ; ############### Calculate word 9 - xor ebp,ebp - ; sqr a[7]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [24+esi] - adc ebp,0 - mov edx,DWORD [12+esi] - ; sqr a[6]*a[3] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [20+esi] - adc ebp,0 - mov edx,DWORD [16+esi] - ; sqr a[5]*a[4] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov 
eax,DWORD [28+esi] - adc ebp,0 - mov DWORD [36+edi],ebx - mov edx,DWORD [12+esi] - ; saved r[9] - ; ############### Calculate word 10 - xor ebx,ebx - ; sqr a[7]*a[3] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [24+esi] - adc ebx,0 - mov edx,DWORD [16+esi] - ; sqr a[6]*a[4] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [20+esi] - adc ebx,0 - ; sqr a[5]*a[5] - mul eax - add ecx,eax - adc ebp,edx - mov edx,DWORD [16+esi] - adc ebx,0 - mov DWORD [40+edi],ecx - mov eax,DWORD [28+esi] - ; saved r[10] - ; ############### Calculate word 11 - xor ecx,ecx - ; sqr a[7]*a[4] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [24+esi] - adc ecx,0 - mov edx,DWORD [20+esi] - ; sqr a[6]*a[5] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [28+esi] - adc ecx,0 - mov DWORD [44+edi],ebp - mov edx,DWORD [20+esi] - ; saved r[11] - ; ############### Calculate word 12 - xor ebp,ebp - ; sqr a[7]*a[5] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [24+esi] - adc ebp,0 - ; sqr a[6]*a[6] - mul eax - add ebx,eax - adc ecx,edx - mov edx,DWORD [24+esi] - adc ebp,0 - mov DWORD [48+edi],ebx - mov eax,DWORD [28+esi] - ; saved r[12] - ; ############### Calculate word 13 - xor ebx,ebx - ; sqr a[7]*a[6] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [28+esi] - adc ebx,0 - mov DWORD [52+edi],ecx - ; saved r[13] - ; ############### Calculate word 14 - xor ecx,ecx - ; sqr a[7]*a[7] - mul eax - add ebp,eax - adc ebx,edx - adc ecx,0 - mov DWORD [56+edi],ebp - ; saved r[14] - mov DWORD [60+edi],ebx - pop ebx - pop ebp - pop edi - pop esi - ret -global _bn_sqr_comba4 -align 16 -_bn_sqr_comba4: -L$_bn_sqr_comba4_begin: - push esi - push edi - push ebp - push ebx - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - xor ebx,ebx - xor ecx,ecx - mov eax,DWORD [esi] - ; ############### Calculate word 0 - xor ebp,ebp - ; sqr a[0]*a[0] - mul eax - add ebx,eax - adc ecx,edx - mov edx,DWORD [esi] - adc ebp,0 - mov DWORD [edi],ebx - mov eax,DWORD [4+esi] - ; saved r[0] - ; ############### Calculate word 1 - xor ebx,ebx - ; sqr a[1]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [8+esi] - adc ebx,0 - mov DWORD [4+edi],ecx - mov edx,DWORD [esi] - ; saved r[1] - ; ############### Calculate word 2 - xor ecx,ecx - ; sqr a[2]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [4+esi] - adc ecx,0 - ; sqr a[1]*a[1] - mul eax - add ebp,eax - adc ebx,edx - mov edx,DWORD [esi] - adc ecx,0 - mov DWORD [8+edi],ebp - mov eax,DWORD [12+esi] - ; saved r[2] - ; ############### Calculate word 3 - xor ebp,ebp - ; sqr a[3]*a[0] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [8+esi] - adc ebp,0 - mov edx,DWORD [4+esi] - ; sqr a[2]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebp,0 - add ebx,eax - adc ecx,edx - mov eax,DWORD [12+esi] - adc ebp,0 - mov DWORD [12+edi],ebx - mov edx,DWORD [4+esi] - ; saved r[3] - ; ############### Calculate word 4 - xor ebx,ebx - ; sqr a[3]*a[1] - mul edx - add eax,eax - adc edx,edx - adc ebx,0 - add ecx,eax - adc ebp,edx - mov eax,DWORD [8+esi] - adc ebx,0 - ; sqr a[2]*a[2] - mul eax - add ecx,eax - adc ebp,edx - mov edx,DWORD [8+esi] - adc ebx,0 - mov DWORD [16+edi],ecx - mov eax,DWORD [12+esi] - ; saved r[4] 
- ; ############### Calculate word 5 - xor ecx,ecx - ; sqr a[3]*a[2] - mul edx - add eax,eax - adc edx,edx - adc ecx,0 - add ebp,eax - adc ebx,edx - mov eax,DWORD [12+esi] - adc ecx,0 - mov DWORD [20+edi],ebp - ; saved r[5] - ; ############### Calculate word 6 - xor ebp,ebp - ; sqr a[3]*a[3] - mul eax - add ebx,eax - adc ecx,edx - adc ebp,0 - mov DWORD [24+edi],ebx - ; saved r[6] - mov DWORD [28+edi],ecx - pop ebx - pop ebp - pop edi - pop esi - ret -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/check.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/check.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/check.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/check.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/dh.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/dh.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/dh.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/dh/dh.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digest.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digest.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digest.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digest.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c.inc similarity index 75% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c.inc index e685ac58..817f454d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/digest/digests.c.inc @@ -59,8 +59,6 @@ #include #include -#include -#include #include #include @@ -75,69 +73,21 @@ #endif -static void md4_init(EVP_MD_CTX *ctx) { - CHECK(MD4_Init(ctx->md_data)); -} - -static void md4_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(MD4_Update(ctx->md_data, data, count)); -} - -static void md4_final(EVP_MD_CTX *ctx, uint8_t *out) { - CHECK(MD4_Final(out, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md4) { - out->type = NID_md4; - out->md_size = MD4_DIGEST_LENGTH; - out->flags = 0; - out->init = md4_init; - out->update = md4_update; - out->final = md4_final; - out->block_size = 64; - out->ctx_size = sizeof(MD4_CTX); -} - - -static void md5_init(EVP_MD_CTX *ctx) { - CHECK(MD5_Init(ctx->md_data)); -} - -static void md5_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(MD5_Update(ctx->md_data, data, count)); -} - -static void md5_final(EVP_MD_CTX *ctx, uint8_t *out) { - CHECK(MD5_Final(out, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md5) { - out->type = NID_md5; - out->md_size = MD5_DIGEST_LENGTH; - out->flags = 0; - out->init = md5_init; - out->update = md5_update; - out->final = md5_final; - out->block_size = 64; - out->ctx_size = sizeof(MD5_CTX); -} - - static void sha1_init(EVP_MD_CTX *ctx) { - CHECK(SHA1_Init(ctx->md_data)); + BCM_sha1_init(ctx->md_data); } static void sha1_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(SHA1_Update(ctx->md_data, data, count)); + BCM_sha1_update(ctx->md_data, data, count); } static void 
sha1_final(EVP_MD_CTX *ctx, uint8_t *md) { - CHECK(SHA1_Final(md, ctx->md_data)); + BCM_sha1_final(md, ctx->md_data); } DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha1) { out->type = NID_sha1; - out->md_size = SHA_DIGEST_LENGTH; + out->md_size = BCM_SHA_DIGEST_LENGTH; out->flags = 0; out->init = sha1_init; out->update = sha1_update; @@ -266,39 +216,4 @@ DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha512_256) { out->ctx_size = sizeof(SHA512_CTX); } - -typedef struct { - MD5_CTX md5; - SHA_CTX sha1; -} MD5_SHA1_CTX; - -static void md5_sha1_init(EVP_MD_CTX *md_ctx) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Init(&ctx->md5) && SHA1_Init(&ctx->sha1)); -} - -static void md5_sha1_update(EVP_MD_CTX *md_ctx, const void *data, - size_t count) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Update(&ctx->md5, data, count) && - SHA1_Update(&ctx->sha1, data, count)); -} - -static void md5_sha1_final(EVP_MD_CTX *md_ctx, uint8_t *out) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Final(out, &ctx->md5) && - SHA1_Final(out + MD5_DIGEST_LENGTH, &ctx->sha1)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md5_sha1) { - out->type = NID_md5_sha1; - out->md_size = MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH; - out->flags = 0; - out->init = md5_sha1_init; - out->update = md5_sha1_update; - out->final = md5_sha1_final; - out->block_size = 64; - out->ctx_size = sizeof(MD5_SHA1_CTX); -} - #undef CHECK diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/digestsign/digestsign.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/digestsign/digestsign.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/digestsign/digestsign.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/digestsign/digestsign.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c.inc similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c.inc index 57306177..249588fb 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_key.c.inc @@ -75,10 +75,12 @@ #include #include #include +#include #include #include "internal.h" #include "../delocate.h" +#include "../ecdsa/internal.h" #include "../service_indicator/internal.h" #include "../../internal.h" @@ -242,7 +244,10 @@ int EC_KEY_set_private_key(EC_KEY *key, const BIGNUM *priv_key) { return 0; } if (!ec_bignum_to_scalar(key->group, &scalar->scalar, priv_key) || - ec_scalar_is_zero(key->group, &scalar->scalar)) { + // Zero is not a valid private key, so it is safe to leak the result of + // this comparison. 
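/*
 * Illustrative sketch, not part of the patch: the EC_KEY_set_private_key hunk
 * above wraps the zero-private-key test in |constant_time_declassify_int|.
 * Under constant-time validation builds, data derived from secrets is treated
 * as tainted and branching on it is flagged; the wrapper marks this one bit as
 * public, which is safe because a rejected key is discarded and never used
 * (the same reasoning as the CONSTTIME_DECLASSIFY calls elsewhere in this
 * patch). The |sketch_*| names below are hypothetical stand-ins that only show
 * the shape of the pattern.
 */
#include <stddef.h>

static int sketch_declassify_int(int v) {
  /* Functionally the identity; a real implementation would additionally
   * unpoison |v| for the constant-time checker (e.g. via Valgrind client
   * requests). */
  return v;
}

static int sketch_private_key_is_nonzero(const unsigned char *key, size_t len) {
  /* Accumulate all bytes without branching on secret data, then make the
   * single yes/no answer public. */
  unsigned char acc = 0;
  for (size_t i = 0; i < len; i++) {
    acc |= key[i];
  }
  return sketch_declassify_int(acc != 0);
}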
+ constant_time_declassify_int( + ec_scalar_is_zero(key->group, &scalar->scalar))) { OPENSSL_PUT_ERROR(EC, EC_R_INVALID_PRIVATE_KEY); ec_wrapped_scalar_free(scalar); return 0; @@ -341,15 +346,17 @@ int EC_KEY_check_fips(const EC_KEY *key) { } if (key->priv_key) { - uint8_t data[16] = {0}; - ECDSA_SIG *sig = ECDSA_do_sign(data, sizeof(data), key); + uint8_t digest[SHA256_DIGEST_LENGTH] = {0}; + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed(digest, sizeof(digest), sig, &sig_len, sizeof(sig), + key)) { + goto end; + } if (boringssl_fips_break_test("ECDSA_PWCT")) { - data[0] = ~data[0]; + digest[0] = ~digest[0]; } - int ok = sig != NULL && - ECDSA_do_verify(data, sizeof(data), sig, key); - ECDSA_SIG_free(sig); - if (!ok) { + if (!ecdsa_verify_fixed(digest, sizeof(digest), sig, sig_len, key)) { OPENSSL_PUT_ERROR(EC, EC_R_PUBLIC_KEY_VALIDATION_FAILED); goto end; } @@ -518,6 +525,11 @@ int EC_KEY_generate_key(EC_KEY *key) { } int EC_KEY_generate_key_fips(EC_KEY *eckey) { + if (eckey == NULL || eckey->group == NULL) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + boringssl_ensure_ecc_self_test(); if (EC_KEY_generate_key(eckey) && EC_KEY_check_fips(eckey)) { diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_montgomery.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_montgomery.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_montgomery.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/ec_montgomery.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/felem.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/felem.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/felem.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/felem.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/oct.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/oct.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/oct.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/oct.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p224-64.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p224-64.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p224-64.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p224-64.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c.inc similarity index 85% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c.inc index 7d63708a..00d65264 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.c.inc @@ -39,7 +39,7 @@ typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; // One converted into the Montgomery domain -static const BN_ULONG ONE[P256_LIMBS] = { +static const BN_ULONG ONE_MONT[P256_LIMBS] = { TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe), }; @@ -116,6 +116,103 @@ static BN_ULONG is_not_zero(BN_ULONG in) { return in; } +#if defined(OPENSSL_X86_64) +// Dispatch between CPU variations. The "_adx" suffixed functions use MULX in +// addition to ADCX/ADOX. MULX is part of BMI2, not ADX, so we must check both +// capabilities. 
+static void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_mul_mont_nohw(res, a, b); + } +} + +static void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_sqr_mont_adx(res, a); + } else { + ecp_nistz256_sqr_mont_nohw(res, a); + } +} + +static void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_ord_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_ord_mul_mont_nohw(res, a, b); + } +} + +static void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + BN_ULONG rep) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_ord_sqr_mont_adx(res, a, rep); + } else { + ecp_nistz256_ord_sqr_mont_nohw(res, a, rep); + } +} + +static void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], + int index) { + if (CRYPTO_is_AVX2_capable()) { + ecp_nistz256_select_w5_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w5_nohw(val, in_t, index); + } +} + +static void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], + int index) { + if (CRYPTO_is_AVX2_capable()) { + ecp_nistz256_select_w7_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w7_nohw(val, in_t, index); + } +} + +static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_double_adx(r, a); + } else { + ecp_nistz256_point_double_nohw(r, a); + } +} + +static void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_add_adx(r, a, b); + } else { + ecp_nistz256_point_add_nohw(r, a, b); + } +} + +static void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_add_affine_adx(r, a, b); + } else { + ecp_nistz256_point_add_affine_nohw(r, a, b); + } +} +#endif // OPENSSL_X86_64 + +// ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain +// by multiplying with 1. +static void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG in[P256_LIMBS]) { + static const BN_ULONG ONE[P256_LIMBS] = {1}; + ecp_nistz256_mul_mont(res, in, ONE); +} + // ecp_nistz256_mod_inverse_sqr_mont sets |r| to (|in| * 2^-256)^-2 * 2^256 mod // p. That is, |r| is the modular inverse square of |in| for input and output in // the Montgomery domain. @@ -328,12 +425,12 @@ static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_JACOBIAN *r, copy_conditional(t.Y, p.Z, wvalue & 1); // Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t| - // is infinity and |ONE| otherwise. |t| was computed from the table, so it - // is infinity iff |wvalue >> 1| is zero. + // is infinity and |ONE_MONT| otherwise. |t| was computed from the table, so + // it is infinity iff |wvalue >> 1| is zero. 
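/*
 * Illustrative sketch, not part of the patch: the surrounding hunk selects
 * between Z = 0 (point at infinity) and ONE_MONT using the
 * |copy_conditional|/|is_not_zero| idiom, i.e. a mask-based select with a
 * data-independent memory access pattern. The mask below is all-ones when
 * |flag| is non-zero and all-zeros otherwise, so |dst| either keeps its
 * contents or receives |src| without a secret-dependent branch. The |sketch_*|
 * names are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

static uint64_t sketch_nonzero_mask(uint64_t flag) {
  /* (flag | -flag) has its top bit set iff flag != 0; subtracting that bit
   * from zero spreads it to every bit of the mask. */
  return (uint64_t)0 - ((flag | (0 - flag)) >> 63);
}

static void sketch_copy_conditional(uint64_t *dst, const uint64_t *src,
                                    size_t num_words, uint64_t flag) {
  uint64_t mask = sketch_nonzero_mask(flag);
  for (size_t i = 0; i < num_words; i++) {
    dst[i] = (src[i] & mask) | (dst[i] & ~mask);
  }
}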
OPENSSL_memcpy(p.X, t.X, sizeof(p.X)); OPENSSL_memcpy(p.Y, t.Y, sizeof(p.Y)); OPENSSL_memset(p.Z, 0, sizeof(p.Z)); - copy_conditional(p.Z, ONE, is_not_zero(wvalue >> 1)); + copy_conditional(p.Z, ONE_MONT, is_not_zero(wvalue >> 1)); for (int i = 1; i < 37; i++) { wvalue = calc_wvalue(&index, p_str); @@ -372,14 +469,14 @@ static void ecp_nistz256_points_mul_public(const EC_GROUP *group, size_t wvalue = calc_first_wvalue(&index, p_str); // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p| - // is infinity and |ONE| otherwise. |p| was computed from the table, so it - // is infinity iff |wvalue >> 1| is zero. + // is infinity and |ONE_MONT| otherwise. |p| was computed from the table, so + // it is infinity iff |wvalue >> 1| is zero. if ((wvalue >> 1) != 0) { OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X, sizeof(p.X)); OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y, sizeof(p.Y)); - OPENSSL_memcpy(p.Z, ONE, sizeof(p.Z)); + OPENSSL_memcpy(p.Z, ONE_MONT, sizeof(p.Z)); } else { OPENSSL_memset(p.X, 0, sizeof(p.X)); OPENSSL_memset(p.Y, 0, sizeof(p.Y)); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.h index 985f8e25..cbf1e056 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256-nistz.h @@ -48,21 +48,29 @@ extern "C" { void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); // ecp_nistz256_mul_mont sets |res| to |a| * |b| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); +#endif // ecp_nistz256_sqr_mont sets |res| to |a| * |a| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +void ecp_nistz256_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +#else void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); - -// ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain -// by multiplying with 1. -static inline void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], - const BN_ULONG in[P256_LIMBS]) { - static const BN_ULONG ONE[P256_LIMBS] = { 1 }; - ecp_nistz256_mul_mont(res, in, ONE); -} +#endif // P-256 scalar operations. @@ -72,15 +80,31 @@ static inline void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], // ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs // are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_ord_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); +#endif // ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2*|rep|) where inputs and // outputs are in Montgomery form. That is, |res| is // (|a| * 2^-256)^(2*|rep|) * 2^256 mod N. 
+#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +void ecp_nistz256_ord_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#else void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#endif // beeu_mod_inverse_vartime sets out = a^-1 mod p using a Euclidean algorithm. // Assumption: 0 < a < p < 2^(256) and p is odd. @@ -111,27 +135,60 @@ typedef struct { // ecp_nistz256_select_w5 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 16 // and all zeros (the point at infinity) if |index| is 0. This is done in // constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w5_nohw(P256_POINT *val, const P256_POINT in_t[16], + int index); +void ecp_nistz256_select_w5_avx2(P256_POINT *val, const P256_POINT in_t[16], + int index); +#else void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], int index); +#endif // ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64 // and all zeros (the point at infinity) if |index| is 0. This is done in // constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w7_nohw(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +void ecp_nistz256_select_w7_avx2(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +#else void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, const P256_POINT_AFFINE in_t[64], int index); +#endif // ecp_nistz256_point_double sets |r| to |a| doubled. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_double_nohw(P256_POINT *r, const P256_POINT *a); +void ecp_nistz256_point_double_adx(P256_POINT *r, const P256_POINT *a); +#else void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); +#endif // ecp_nistz256_point_add adds |a| to |b| and places the result in |r|. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +void ecp_nistz256_point_add_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +#else void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b); +#endif // ecp_nistz256_point_add_affine adds |a| to |b| and places the result in // |r|. |a| and |b| must not represent the same point unless they are both // infinity. 
+#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_affine_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +void ecp_nistz256_point_add_affine_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +#else void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, const P256_POINT_AFFINE *b); +#endif #endif /* !defined(OPENSSL_NO_ASM) && \ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/p256.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c.inc similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c.inc index 0850c077..7721b488 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/scalar.c.inc @@ -23,8 +23,12 @@ int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out, const BIGNUM *in) { + // Scalars, which are often secret, must be reduced modulo the order. Those + // that are not will be discarded, so leaking the result of the comparison is + // safe. if (!bn_copy_words(out->words, group->order.N.width, in) || - !bn_less_than_words(out->words, group->order.N.d, group->order.N.width)) { + !constant_time_declassify_int(bn_less_than_words( + out->words, group->order.N.d, group->order.N.width))) { OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR); return 0; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple_mul.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple_mul.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple_mul.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/simple_mul.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/util.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/util.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/util.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/util.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/wnaf.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/wnaf.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/wnaf.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ec/wnaf.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdh/ecdh.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdh/ecdh.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdh/ecdh.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdh/ecdh.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c.inc similarity index 74% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c.inc index 053235bc..5065f2a8 100644 --- 
a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/ecdsa.c.inc @@ -95,61 +95,9 @@ static void digest_to_scalar(const EC_GROUP *group, EC_SCALAR *out, order->width); } -ECDSA_SIG *ECDSA_SIG_new(void) { - ECDSA_SIG *sig = OPENSSL_malloc(sizeof(ECDSA_SIG)); - if (sig == NULL) { - return NULL; - } - sig->r = BN_new(); - sig->s = BN_new(); - if (sig->r == NULL || sig->s == NULL) { - ECDSA_SIG_free(sig); - return NULL; - } - return sig; -} - -void ECDSA_SIG_free(ECDSA_SIG *sig) { - if (sig == NULL) { - return; - } - - BN_free(sig->r); - BN_free(sig->s); - OPENSSL_free(sig); -} - -const BIGNUM *ECDSA_SIG_get0_r(const ECDSA_SIG *sig) { - return sig->r; -} - -const BIGNUM *ECDSA_SIG_get0_s(const ECDSA_SIG *sig) { - return sig->s; -} - -void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **out_r, - const BIGNUM **out_s) { - if (out_r != NULL) { - *out_r = sig->r; - } - if (out_s != NULL) { - *out_s = sig->s; - } -} - -int ECDSA_SIG_set0(ECDSA_SIG *sig, BIGNUM *r, BIGNUM *s) { - if (r == NULL || s == NULL) { - return 0; - } - BN_free(sig->r); - BN_free(sig->s); - sig->r = r; - sig->s = s; - return 1; -} - -int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey) { +int ecdsa_verify_fixed_no_self_test(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, + const EC_KEY *eckey) { const EC_GROUP *group = EC_KEY_get0_group(eckey); const EC_POINT *pub_key = EC_KEY_get0_public_key(eckey); if (group == NULL || pub_key == NULL || sig == NULL) { @@ -157,11 +105,13 @@ int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, return 0; } + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); EC_SCALAR r, s, u1, u2, s_inv_mont, m; - if (BN_is_zero(sig->r) || - !ec_bignum_to_scalar(group, &r, sig->r) || - BN_is_zero(sig->s) || - !ec_bignum_to_scalar(group, &s, sig->s)) { + if (sig_len != 2 * scalar_len || + !ec_scalar_from_bytes(group, &r, sig, scalar_len) || + ec_scalar_is_zero(group, &r) || + !ec_scalar_from_bytes(group, &s, sig + scalar_len, scalar_len) || + ec_scalar_is_zero(group, &s)) { OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); return 0; } @@ -195,24 +145,31 @@ int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, return 1; } -int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey) { +int ecdsa_verify_fixed(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, const EC_KEY *key) { boringssl_ensure_ecc_self_test(); - return ecdsa_do_verify_no_self_test(digest, digest_len, sig, eckey); + return ecdsa_verify_fixed_no_self_test(digest, digest_len, sig, sig_len, key); } -static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, - const EC_SCALAR *priv_key, const EC_SCALAR *k, - const uint8_t *digest, size_t digest_len) { +static int ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_SCALAR *priv_key, const EC_SCALAR *k, + const uint8_t *digest, size_t digest_len) { *out_retry = 0; // Check that the size of the group order is FIPS compliant (FIPS 186-4 // B.5.2). 
const BIGNUM *order = EC_GROUP_get0_order(group); if (BN_num_bits(order) < 160) { - OPENSSL_PUT_ERROR(ECDSA, EC_R_INVALID_GROUP_ORDER); - return NULL; + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); + return 0; + } + + size_t sig_len = 2 * BN_num_bytes(order); + if (sig_len > max_sig_len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; } // Compute r, the x-coordinate of k * generator. @@ -220,12 +177,12 @@ static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, EC_SCALAR r; if (!ec_point_mul_scalar_base(group, &tmp_point, k) || !ec_get_x_coordinate_as_scalar(group, &r, &tmp_point)) { - return NULL; + return 0; } if (constant_time_declassify_int(ec_scalar_is_zero(group, &r))) { *out_retry = 1; - return NULL; + return 0; } // s = priv_key * r. Note if only one parameter is in the Montgomery domain, @@ -252,71 +209,59 @@ static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, ec_scalar_mul_montgomery(group, &s, &s, &tmp); if (constant_time_declassify_int(ec_scalar_is_zero(group, &s))) { *out_retry = 1; - return NULL; + return 0; } CONSTTIME_DECLASSIFY(r.words, sizeof(r.words)); CONSTTIME_DECLASSIFY(s.words, sizeof(r.words)); - ECDSA_SIG *ret = ECDSA_SIG_new(); - if (ret == NULL || // - !bn_set_words(ret->r, r.words, order->width) || - !bn_set_words(ret->s, s.words, order->width)) { - ECDSA_SIG_free(ret); - return NULL; - } - return ret; + size_t len; + ec_scalar_to_bytes(group, sig, &len, &r); + assert(len == sig_len / 2); + ec_scalar_to_bytes(group, sig + len, &len, &s); + assert(len == sig_len / 2); + *out_sig_len = sig_len; + return 1; } -ECDSA_SIG *ecdsa_sign_with_nonce_for_known_answer_test(const uint8_t *digest, - size_t digest_len, - const EC_KEY *eckey, - const uint8_t *nonce, - size_t nonce_len) { +int ecdsa_sign_fixed_with_nonce_for_known_answer_test( + const uint8_t *digest, size_t digest_len, uint8_t *sig, size_t *out_sig_len, + size_t max_sig_len, const EC_KEY *eckey, const uint8_t *nonce, + size_t nonce_len) { if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED); - return NULL; + return 0; } const EC_GROUP *group = EC_KEY_get0_group(eckey); if (group == NULL || eckey->priv_key == NULL) { OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); - return NULL; + return 0; } const EC_SCALAR *priv_key = &eckey->priv_key->scalar; EC_SCALAR k; if (!ec_scalar_from_bytes(group, &k, nonce, nonce_len)) { - return NULL; + return 0; } int retry_ignored; - return ecdsa_sign_impl(group, &retry_ignored, priv_key, &k, digest, - digest_len); -} - -// This function is only exported for testing and is not called in production -// code. 
-ECDSA_SIG *ECDSA_sign_with_nonce_and_leak_private_key_for_testing( - const uint8_t *digest, size_t digest_len, const EC_KEY *eckey, - const uint8_t *nonce, size_t nonce_len) { - boringssl_ensure_ecc_self_test(); - - return ecdsa_sign_with_nonce_for_known_answer_test(digest, digest_len, eckey, - nonce, nonce_len); + return ecdsa_sign_impl(group, &retry_ignored, sig, out_sig_len, max_sig_len, + priv_key, &k, digest, digest_len); } -ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, - const EC_KEY *eckey) { +int ecdsa_sign_fixed(const uint8_t *digest, size_t digest_len, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_KEY *eckey) { boringssl_ensure_ecc_self_test(); if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED); - return NULL; + return 0; } const EC_GROUP *group = EC_KEY_get0_group(eckey); if (group == NULL || eckey->priv_key == NULL) { OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); - return NULL; + return 0; } const BIGNUM *order = EC_GROUP_get0_order(group); const EC_SCALAR *priv_key = &eckey->priv_key->scalar; @@ -340,12 +285,11 @@ ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, // FIPS) because the probability of requiring even one retry is negligible, // let alone 32. static const int kMaxIterations = 32; - ECDSA_SIG *ret = NULL; + int ret = 0; int iters = 0; for (;;) { EC_SCALAR k; if (!ec_random_nonzero_scalar(group, &k, additional_data)) { - ret = NULL; goto out; } @@ -354,8 +298,9 @@ ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, CONSTTIME_SECRET(k.words, sizeof(k.words)); int retry; - ret = ecdsa_sign_impl(group, &retry, priv_key, &k, digest, digest_len); - if (ret != NULL || !retry) { + ret = ecdsa_sign_impl(group, &retry, sig, out_sig_len, max_sig_len, + priv_key, &k, digest, digest_len); + if (ret || !retry) { goto out; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/internal.h index 0a8def26..519f6e18 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ecdsa/internal.h @@ -17,25 +17,42 @@ #include +#include "../ec/internal.h" + #if defined(__cplusplus) extern "C" { #endif -// ecdsa_sign_with_nonce_for_known_answer_test behaves like |ECDSA_do_sign| but -// takes a fixed nonce. This function is used as part of known-answer tests in -// the FIPS module. -ECDSA_SIG *ecdsa_sign_with_nonce_for_known_answer_test(const uint8_t *digest, - size_t digest_len, - const EC_KEY *eckey, - const uint8_t *nonce, - size_t nonce_len); +// ECDSA_MAX_FIXED_LEN is the maximum length of an ECDSA signature in the +// fixed-width, big-endian format from IEEE P1363. +#define ECDSA_MAX_FIXED_LEN (2 * EC_MAX_BYTES) + +// ecdsa_sign_fixed behaves like |ECDSA_sign| but uses the fixed-width, +// big-endian format from IEEE P1363. +int ecdsa_sign_fixed(const uint8_t *digest, size_t digest_len, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_KEY *key); + +// ecdsa_sign_fixed_with_nonce_for_known_answer_test behaves like +// |ecdsa_sign_fixed| but takes a caller-supplied nonce. This function is used +// as part of known-answer tests in the FIPS module. 
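/*
 * Illustrative sketch, not part of the patch: the declarations above describe
 * the "fixed-width, big-endian format from IEEE P1363" used by
 * |ecdsa_sign_fixed|/|ecdsa_verify_fixed|. A signature is simply r || s, each
 * scalar left-padded with zeros to the byte length of the group order, so the
 * total length is always exactly twice that length (bounded by
 * ECDSA_MAX_FIXED_LEN). The encoder below shows that layout for word-based
 * scalars; the |sketch_*| names are hypothetical stand-ins for the EC_SCALAR
 * helpers used in ecdsa.c.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Serialize a little-endian array of 64-bit words as a big-endian,
 * fixed-width byte string of |scalar_len| bytes. */
static void sketch_scalar_to_be_bytes(uint8_t *out, size_t scalar_len,
                                      const uint64_t *words, size_t num_words) {
  memset(out, 0, scalar_len);
  for (size_t i = 0; i < scalar_len && i / 8 < num_words; i++) {
    out[scalar_len - 1 - i] = (uint8_t)(words[i / 8] >> (8 * (i % 8)));
  }
}

/* Lay out a P1363-style signature: big-endian r, then big-endian s, each
 * occupying exactly |scalar_len| bytes. Returns the total length written. */
static size_t sketch_p1363_encode(uint8_t *sig, size_t scalar_len,
                                  const uint64_t *r, const uint64_t *s,
                                  size_t num_words) {
  sketch_scalar_to_be_bytes(sig, scalar_len, r, num_words);
  sketch_scalar_to_be_bytes(sig + scalar_len, scalar_len, s, num_words);
  return 2 * scalar_len;
}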
+int ecdsa_sign_fixed_with_nonce_for_known_answer_test( + const uint8_t *digest, size_t digest_len, uint8_t *sig, size_t *out_sig_len, + size_t max_sig_len, const EC_KEY *key, const uint8_t *nonce, + size_t nonce_len); + +// ecdsa_verify_fixed behaves like |ECDSA_verify| but uses the fixed-width, +// big-endian format from IEEE P1363. +int ecdsa_verify_fixed(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, const EC_KEY *key); -// ecdsa_do_verify_no_self_test does the same as |ECDSA_do_verify|, but doesn't +// ecdsa_verify_fixed_no_self_test behaves like ecdsa_verify_fixed, but doesn't // try to run the self-test first. This is for use in the self tests themselves, // to prevent an infinite loop. -int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey); +int ecdsa_verify_fixed_no_self_test(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, + const EC_KEY *key); #if defined(__cplusplus) diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/fips_shared_support.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/fips_shared_support.c index 2a66a1f0..74b35f01 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/fips_shared_support.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/fips_shared_support.c @@ -20,13 +20,10 @@ // that must be replaced with the real value during the build process. This // value need only be distinct, i.e. so that we can safely search-and-replace it // in an object file. -const uint8_t BORINGSSL_bcm_text_hash[64]; -const uint8_t BORINGSSL_bcm_text_hash[64] = { +const uint8_t BORINGSSL_bcm_text_hash[32]; +const uint8_t BORINGSSL_bcm_text_hash[32] = { 0xae, 0x2c, 0xea, 0x2a, 0xbd, 0xa6, 0xf3, 0xec, 0x97, 0x7f, 0x9b, 0xf6, 0x94, 0x9a, 0xfc, 0x83, 0x68, 0x27, 0xcb, 0xa0, 0xa0, 0x9f, - 0x6b, 0x6f, 0xde, 0x52, 0xcd, 0xe2, 0xcd, 0xff, 0x31, 0x80, 0xa2, - 0xd4, 0xc3, 0x66, 0x0f, 0xc2, 0x6a, 0x7b, 0xf4, 0xbe, 0x39, 0xa2, - 0xd7, 0x25, 0xdb, 0x21, 0x98, 0xe9, 0xd5, 0x53, 0xbf, 0x5c, 0x32, - 0x06, 0x83, 0x34, 0x0c, 0x65, 0x89, 0x52, 0xbd, 0x1f, + 0x6b, 0x6f, 0xde, 0x52, 0xcd, 0xe2, 0xcd, 0xff, 0x31, 0x80, }; #endif // FIPS && SHARED_LIBRARY diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-ios.ios.arm.S deleted file mode 100644 index d6ddf144..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-ios.ios.arm.S +++ /dev/null @@ -1,257 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL -@ instructions are in aesv8-armx.pl.) - - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#define ldrplb ldrbpl -#define ldrneb ldrbne -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif -#if __ARM_MAX_ARCH__>=7 - - - -.globl _gcm_init_neon -.private_extern _gcm_init_neon -#ifdef __thumb2__ -.thumb_func _gcm_init_neon -#endif -.align 4 -_gcm_init_neon: - vld1.64 d7,[r1]! 
@ load H - vmov.i8 q8,#0xe1 - vld1.64 d6,[r1] - vshl.i64 d17,#57 - vshr.u64 d16,#63 @ t0=0xc2....01 - vdup.8 q9,d7[7] - vshr.u64 d26,d6,#63 - vshr.s8 q9,#7 @ broadcast carry bit - vshl.i64 q3,q3,#1 - vand q8,q8,q9 - vorr d7,d26 @ H<<<=1 - veor q3,q3,q8 @ twisted H - vstmia r0,{q3} - - bx lr @ bx lr - - -.globl _gcm_gmult_neon -.private_extern _gcm_gmult_neon -#ifdef __thumb2__ -.thumb_func _gcm_gmult_neon -#endif -.align 4 -_gcm_gmult_neon: - vld1.64 d7,[r0]! @ load Xi - vld1.64 d6,[r0]! - vmov.i64 d29,#0x0000ffffffffffff - vldmia r1,{d26,d27} @ load twisted H - vmov.i64 d30,#0x00000000ffffffff -#ifdef __ARMEL__ - vrev64.8 q3,q3 -#endif - vmov.i64 d31,#0x000000000000ffff - veor d28,d26,d27 @ Karatsuba pre-processing - mov r3,#16 - b Lgmult_neon - - -.globl _gcm_ghash_neon -.private_extern _gcm_ghash_neon -#ifdef __thumb2__ -.thumb_func _gcm_ghash_neon -#endif -.align 4 -_gcm_ghash_neon: - vld1.64 d1,[r0]! @ load Xi - vld1.64 d0,[r0]! - vmov.i64 d29,#0x0000ffffffffffff - vldmia r1,{d26,d27} @ load twisted H - vmov.i64 d30,#0x00000000ffffffff -#ifdef __ARMEL__ - vrev64.8 q0,q0 -#endif - vmov.i64 d31,#0x000000000000ffff - veor d28,d26,d27 @ Karatsuba pre-processing - -Loop_neon: - vld1.64 d7,[r2]! @ load inp - vld1.64 d6,[r2]! -#ifdef __ARMEL__ - vrev64.8 q3,q3 -#endif - veor q3,q0 @ inp^=Xi -Lgmult_neon: - vext.8 d16, d26, d26, #1 @ A1 - vmull.p8 q8, d16, d6 @ F = A1*B - vext.8 d0, d6, d6, #1 @ B1 - vmull.p8 q0, d26, d0 @ E = A*B1 - vext.8 d18, d26, d26, #2 @ A2 - vmull.p8 q9, d18, d6 @ H = A2*B - vext.8 d22, d6, d6, #2 @ B2 - vmull.p8 q11, d26, d22 @ G = A*B2 - vext.8 d20, d26, d26, #3 @ A3 - veor q8, q8, q0 @ L = E + F - vmull.p8 q10, d20, d6 @ J = A3*B - vext.8 d0, d6, d6, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q0, d26, d0 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d6, d6, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d26, d22 @ K = A*B4 - veor q10, q10, q0 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q0, d26, d6 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q0, q0, q8 - veor q0, q0, q10 - veor d6,d6,d7 @ Karatsuba pre-processing - vext.8 d16, d28, d28, #1 @ A1 - vmull.p8 q8, d16, d6 @ F = A1*B - vext.8 d2, d6, d6, #1 @ B1 - vmull.p8 q1, d28, d2 @ E = A*B1 - vext.8 d18, d28, d28, #2 @ A2 - vmull.p8 q9, d18, d6 @ H = A2*B - vext.8 d22, d6, d6, #2 @ B2 - vmull.p8 q11, d28, d22 @ G = A*B2 - vext.8 d20, d28, d28, #3 @ A3 - veor q8, q8, q1 @ L = E + F - vmull.p8 q10, d20, d6 @ J = A3*B - vext.8 d2, d6, d6, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q1, d28, d2 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d6, d6, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d28, d22 @ K = A*B4 - veor q10, q10, q1 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q1, d28, d6 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q1, q1, q8 - 
veor q1, q1, q10 - vext.8 d16, d27, d27, #1 @ A1 - vmull.p8 q8, d16, d7 @ F = A1*B - vext.8 d4, d7, d7, #1 @ B1 - vmull.p8 q2, d27, d4 @ E = A*B1 - vext.8 d18, d27, d27, #2 @ A2 - vmull.p8 q9, d18, d7 @ H = A2*B - vext.8 d22, d7, d7, #2 @ B2 - vmull.p8 q11, d27, d22 @ G = A*B2 - vext.8 d20, d27, d27, #3 @ A3 - veor q8, q8, q2 @ L = E + F - vmull.p8 q10, d20, d7 @ J = A3*B - vext.8 d4, d7, d7, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q2, d27, d4 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d7, d7, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d27, d22 @ K = A*B4 - veor q10, q10, q2 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q2, d27, d7 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q2, q2, q8 - veor q2, q2, q10 - veor q1,q1,q0 @ Karatsuba post-processing - veor q1,q1,q2 - veor d1,d1,d2 - veor d4,d4,d3 @ Xh|Xl - 256-bit result - - @ equivalent of reduction_avx from ghash-x86_64.pl - vshl.i64 q9,q0,#57 @ 1st phase - vshl.i64 q10,q0,#62 - veor q10,q10,q9 @ - vshl.i64 q9,q0,#63 - veor q10, q10, q9 @ - veor d1,d1,d20 @ - veor d4,d4,d21 - - vshr.u64 q10,q0,#1 @ 2nd phase - veor q2,q2,q0 - veor q0,q0,q10 @ - vshr.u64 q10,q10,#6 - vshr.u64 q0,q0,#1 @ - veor q0,q0,q2 @ - veor q0,q0,q10 @ - - subs r3,#16 - bne Loop_neon - -#ifdef __ARMEL__ - vrev64.8 q0,q0 -#endif - sub r0,#16 - vst1.64 d1,[r0]! @ write out Xi - vst1.64 d0,[r0] - - bx lr @ bx lr - -#endif -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-windows.windows.x86.S deleted file mode 100644 index 43d7bc66..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-windows.windows.x86.S +++ /dev/null @@ -1,304 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _gcm_gmult_ssse3 -align 16 -_gcm_gmult_ssse3: -L$_gcm_gmult_ssse3_begin: - push ebp - push ebx - push esi - push edi - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - movdqu xmm0,[edi] - call L$000pic_point -L$000pic_point: - pop eax - movdqa xmm7,[(L$reverse_bytes-L$000pic_point)+eax] - movdqa xmm2,[(L$low4_mask-L$000pic_point)+eax] -db 102,15,56,0,199 - movdqa xmm1,xmm2 - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm2 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - mov eax,5 -L$001loop_row_1: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$001loop_row_1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - mov eax,5 -L$002loop_row_2: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$002loop_row_2 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - mov eax,6 -L$003loop_row_3: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$003loop_row_3 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 -db 102,15,56,0,215 - movdqu [edi],xmm2 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pop edi - pop esi - pop ebx - pop ebp - ret -global _gcm_ghash_ssse3 -align 16 -_gcm_ghash_ssse3: -L$_gcm_ghash_ssse3_begin: - push ebp - push ebx - push esi - push edi - mov edi,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov edx,DWORD [28+esp] - mov ecx,DWORD [32+esp] - movdqu xmm0,[edi] - call L$004pic_point -L$004pic_point: - pop ebx - movdqa xmm7,[(L$reverse_bytes-L$004pic_point)+ebx] - and ecx,-16 -db 102,15,56,0,199 - pxor xmm3,xmm3 -L$005loop_ghash: - movdqa xmm2,[(L$low4_mask-L$004pic_point)+ebx] - movdqu xmm1,[edx] -db 102,15,56,0,207 - pxor xmm0,xmm1 - movdqa xmm1,xmm2 - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm2 - pxor xmm2,xmm2 - mov eax,5 -L$006loop_row_4: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - 
psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$006loop_row_4 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - mov eax,5 -L$007loop_row_5: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$007loop_row_5 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - mov eax,6 -L$008loop_row_6: - movdqa xmm4,[esi] - lea esi,[16+esi] - movdqa xmm6,xmm2 -db 102,15,58,15,243,1 - movdqa xmm3,xmm6 - psrldq xmm2,1 - movdqa xmm5,xmm4 -db 102,15,56,0,224 -db 102,15,56,0,233 - pxor xmm2,xmm5 - movdqa xmm5,xmm4 - psllq xmm5,60 - movdqa xmm6,xmm5 - pslldq xmm6,8 - pxor xmm3,xmm6 - psrldq xmm5,8 - pxor xmm2,xmm5 - psrlq xmm4,4 - pxor xmm2,xmm4 - sub eax,1 - jnz NEAR L$008loop_row_6 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,1 - pxor xmm2,xmm3 - psrlq xmm3,5 - pxor xmm2,xmm3 - pxor xmm3,xmm3 - movdqa xmm0,xmm2 - lea esi,[esi-256] - lea edx,[16+edx] - sub ecx,16 - jnz NEAR L$005loop_ghash -db 102,15,56,0,199 - movdqu [edi],xmm0 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pop edi - pop esi - pop ebx - pop ebp - ret -align 16 -L$reverse_bytes: -db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -align 16 -L$low4_mask: -dd 252645135,252645135,252645135,252645135 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-windows.windows.x86.S deleted file mode 100644 index a47ab266..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-windows.windows.x86.S +++ /dev/null @@ -1,337 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _gcm_init_clmul -align 16 -_gcm_init_clmul: -L$_gcm_init_clmul_begin: - mov edx,DWORD [4+esp] - mov eax,DWORD [8+esp] - call L$000pic -L$000pic: - pop ecx - lea ecx,[(L$bswap-L$000pic)+ecx] - movdqu xmm2,[eax] - pshufd xmm2,xmm2,78 - pshufd xmm4,xmm2,255 - movdqa xmm3,xmm2 - psllq xmm2,1 - pxor xmm5,xmm5 - psrlq xmm3,63 - pcmpgtd xmm5,xmm4 - pslldq xmm3,8 - por xmm2,xmm3 - pand xmm5,[16+ecx] - pxor xmm2,xmm5 - movdqa xmm0,xmm2 - movdqa xmm1,xmm0 - pshufd xmm3,xmm0,78 - pshufd xmm4,xmm2,78 - pxor xmm3,xmm0 - pxor xmm4,xmm2 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 - xorps xmm3,xmm0 - xorps xmm3,xmm1 - movdqa xmm4,xmm3 - psrldq xmm3,8 - pslldq xmm4,8 - pxor xmm1,xmm3 - pxor xmm0,xmm4 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm1,xmm4 - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 - pshufd xmm3,xmm2,78 - pshufd xmm4,xmm0,78 - pxor xmm3,xmm2 - movdqu [edx],xmm2 - pxor xmm4,xmm0 - movdqu [16+edx],xmm0 -db 102,15,58,15,227,8 - movdqu [32+edx],xmm4 - ret -global _gcm_gmult_clmul -align 16 -_gcm_gmult_clmul: -L$_gcm_gmult_clmul_begin: - mov eax,DWORD [4+esp] - mov edx,DWORD [8+esp] - call L$001pic -L$001pic: - pop ecx - lea ecx,[(L$bswap-L$001pic)+ecx] - movdqu xmm0,[eax] - movdqa xmm5,[ecx] - movups xmm2,[edx] -db 102,15,56,0,197 - movups xmm4,[32+edx] - movdqa xmm1,xmm0 - pshufd xmm3,xmm0,78 - pxor xmm3,xmm0 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 - xorps xmm3,xmm0 - xorps xmm3,xmm1 - movdqa xmm4,xmm3 - psrldq xmm3,8 - pslldq xmm4,8 - pxor xmm1,xmm3 - pxor xmm0,xmm4 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm1,xmm4 - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 -db 102,15,56,0,197 - movdqu [eax],xmm0 - ret -global _gcm_ghash_clmul -align 16 -_gcm_ghash_clmul: -L$_gcm_ghash_clmul_begin: - push ebp - push ebx - push esi - push edi - mov eax,DWORD [20+esp] - mov edx,DWORD [24+esp] - mov esi,DWORD [28+esp] - mov ebx,DWORD [32+esp] - call L$002pic -L$002pic: - pop ecx - lea ecx,[(L$bswap-L$002pic)+ecx] - movdqu xmm0,[eax] - movdqa xmm5,[ecx] - movdqu xmm2,[edx] -db 102,15,56,0,197 - sub ebx,16 - jz NEAR L$003odd_tail - movdqu xmm3,[esi] - movdqu xmm6,[16+esi] -db 102,15,56,0,221 -db 102,15,56,0,245 - movdqu xmm5,[32+edx] - pxor xmm0,xmm3 - pshufd xmm3,xmm6,78 - movdqa xmm7,xmm6 - pxor xmm3,xmm6 - lea esi,[32+esi] -db 102,15,58,68,242,0 -db 102,15,58,68,250,17 -db 102,15,58,68,221,0 - movups xmm2,[16+edx] - nop - sub ebx,32 - jbe NEAR L$004even_tail - jmp NEAR L$005mod_loop -align 32 -L$005mod_loop: - pshufd xmm4,xmm0,78 - movdqa xmm1,xmm0 - pxor xmm4,xmm0 - nop -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,229,16 - movups xmm2,[edx] - xorps xmm0,xmm6 - movdqa xmm5,[ecx] - xorps xmm1,xmm7 - movdqu xmm7,[esi] - pxor xmm3,xmm0 - movdqu xmm6,[16+esi] - pxor 
xmm3,xmm1 -db 102,15,56,0,253 - pxor xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,8 - pslldq xmm3,8 - pxor xmm1,xmm4 - pxor xmm0,xmm3 -db 102,15,56,0,245 - pxor xmm1,xmm7 - movdqa xmm7,xmm6 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 -db 102,15,58,68,242,0 - movups xmm5,[32+edx] - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - pshufd xmm3,xmm7,78 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm3,xmm7 - pxor xmm1,xmm4 -db 102,15,58,68,250,17 - movups xmm2,[16+edx] - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 -db 102,15,58,68,221,0 - lea esi,[32+esi] - sub ebx,32 - ja NEAR L$005mod_loop -L$004even_tail: - pshufd xmm4,xmm0,78 - movdqa xmm1,xmm0 - pxor xmm4,xmm0 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,229,16 - movdqa xmm5,[ecx] - xorps xmm0,xmm6 - xorps xmm1,xmm7 - pxor xmm3,xmm0 - pxor xmm3,xmm1 - pxor xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,8 - pslldq xmm3,8 - pxor xmm1,xmm4 - pxor xmm0,xmm3 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm1,xmm4 - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 - test ebx,ebx - jnz NEAR L$006done - movups xmm2,[edx] -L$003odd_tail: - movdqu xmm3,[esi] -db 102,15,56,0,221 - pxor xmm0,xmm3 - movdqa xmm1,xmm0 - pshufd xmm3,xmm0,78 - pshufd xmm4,xmm2,78 - pxor xmm3,xmm0 - pxor xmm4,xmm2 -db 102,15,58,68,194,0 -db 102,15,58,68,202,17 -db 102,15,58,68,220,0 - xorps xmm3,xmm0 - xorps xmm3,xmm1 - movdqa xmm4,xmm3 - psrldq xmm3,8 - pslldq xmm4,8 - pxor xmm1,xmm3 - pxor xmm0,xmm4 - movdqa xmm4,xmm0 - movdqa xmm3,xmm0 - psllq xmm0,5 - pxor xmm3,xmm0 - psllq xmm0,1 - pxor xmm0,xmm3 - psllq xmm0,57 - movdqa xmm3,xmm0 - pslldq xmm0,8 - psrldq xmm3,8 - pxor xmm0,xmm4 - pxor xmm1,xmm3 - movdqa xmm4,xmm0 - psrlq xmm0,1 - pxor xmm1,xmm4 - pxor xmm4,xmm0 - psrlq xmm0,5 - pxor xmm0,xmm4 - psrlq xmm0,1 - pxor xmm0,xmm1 -L$006done: -db 102,15,56,0,197 - movdqu [eax],xmm0 - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$bswap: -db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -db 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 -db 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 -db 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 -db 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 -db 0 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-ios.ios.arm.S deleted file mode 100644 index 2c9d309e..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-ios.ios.arm.S +++ /dev/null @@ -1,259 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -#if __ARM_MAX_ARCH__>=7 -.text - -.code 32 -#undef __thumb2__ -.globl _gcm_init_v8 -.private_extern _gcm_init_v8 -#ifdef __thumb2__ -.thumb_func _gcm_init_v8 -#endif -.align 4 -_gcm_init_v8: - AARCH64_VALID_CALL_TARGET - vld1.64 {q9},[r1] @ load input H - vmov.i8 q11,#0xe1 - vshl.i64 q11,q11,#57 @ 0xc2.0 - vext.8 q3,q9,q9,#8 - vshr.u64 q10,q11,#63 - vdup.32 q9,d18[1] - vext.8 q8,q10,q11,#8 @ t0=0xc2....01 - vshr.u64 q10,q3,#63 - vshr.s32 q9,q9,#31 @ broadcast carry bit - vand q10,q10,q8 - vshl.i64 q3,q3,#1 - vext.8 q10,q10,q10,#8 - vand q8,q8,q9 - vorr q3,q3,q10 @ H<<<=1 - veor q12,q3,q8 @ twisted H - vst1.64 {q12},[r0]! @ store Htable[0] - - @ calculate H^2 - vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing -.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 - veor q8,q8,q12 -.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 -.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q14,q0,q10 - - vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing - veor q9,q9,q14 - vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed - vst1.64 {q13,q14},[r0]! @ store Htable[1..2] - bx lr - -.globl _gcm_gmult_v8 -.private_extern _gcm_gmult_v8 -#ifdef __thumb2__ -.thumb_func _gcm_gmult_v8 -#endif -.align 4 -_gcm_gmult_v8: - AARCH64_VALID_CALL_TARGET - vld1.64 {q9},[r0] @ load Xi - vmov.i8 q11,#0xe1 - vld1.64 {q12,q13},[r1] @ load twisted H, ... - vshl.u64 q11,q11,#57 -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vext.8 q3,q9,q9,#8 - -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo - veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q0,q0,q10 - -#ifndef __ARMEB__ - vrev64.8 q0,q0 -#endif - vext.8 q0,q0,q0,#8 - vst1.64 {q0},[r0] @ write out Xi - - bx lr - -.globl _gcm_ghash_v8 -.private_extern _gcm_ghash_v8 -#ifdef __thumb2__ -.thumb_func _gcm_ghash_v8 -#endif -.align 4 -_gcm_ghash_v8: - AARCH64_VALID_CALL_TARGET - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so - vld1.64 {q0},[r0] @ load [rotated] Xi - @ "[rotated]" means that - @ loaded value would have - @ to be rotated in order to - @ make it appear as in - @ algorithm specification - subs r3,r3,#32 @ see if r3 is 32 or larger - mov r12,#16 @ r12 is used as post- - @ increment for input pointer; - @ as loop is modulo-scheduled - @ r12 is zeroed just in time - @ to preclude overstepping - @ inp[len], which means that - @ last block[s] are actually - @ loaded twice, but last - @ copy is not processed - vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 - vmov.i8 q11,#0xe1 - vld1.64 {q14},[r1] - moveq r12,#0 @ is it time to zero r12? - vext.8 q0,q0,q0,#8 @ rotate Xi - vld1.64 {q8},[r2]! 
@ load [rotated] I[0] - vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant -#ifndef __ARMEB__ - vrev64.8 q8,q8 - vrev64.8 q0,q0 -#endif - vext.8 q3,q8,q8,#8 @ rotate I[0] - blo Lodd_tail_v8 @ r3 was less than 32 - vld1.64 {q9},[r2],r12 @ load [rotated] I[1] -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vext.8 q7,q9,q9,#8 - veor q3,q3,q0 @ I[i]^=Xi -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 - veor q9,q9,q7 @ Karatsuba pre-processing -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 - b Loop_mod2x_v8 - -.align 4 -Loop_mod2x_v8: - vext.8 q10,q3,q3,#8 - subs r3,r3,#32 @ is there more data? -.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo - movlo r12,#0 @ is it time to zero r12? - -.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 - veor q10,q10,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi - veor q0,q0,q4 @ accumulate -.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) - vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] - - veor q2,q2,q6 - moveq r12,#0 @ is it time to zero r12? - veor q1,q1,q5 - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] -#ifndef __ARMEB__ - vrev64.8 q8,q8 -#endif - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - vext.8 q7,q9,q9,#8 - vext.8 q3,q8,q8,#8 - veor q0,q1,q10 -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 - veor q3,q3,q2 @ accumulate q3 early - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q3,q3,q10 - veor q9,q9,q7 @ Karatsuba pre-processing - veor q3,q3,q0 -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 - bhs Loop_mod2x_v8 @ there was at least 32 more bytes - - veor q2,q2,q10 - vext.8 q3,q8,q8,#8 @ re-construct q3 - adds r3,r3,#32 @ re-construct r3 - veor q0,q0,q2 @ re-construct q0 - beq Ldone_v8 @ is r3 zero? 
-Lodd_tail_v8: - vext.8 q10,q0,q0,#8 - veor q3,q3,q0 @ inp^=Xi - veor q9,q8,q10 @ q9 is rotated inp^Xi - -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo - veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q0,q0,q10 - -Ldone_v8: -#ifndef __ARMEB__ - vrev64.8 q0,q0 -#endif - vext.8 q0,q0,q0,#8 - vst1.64 {q0},[r0] @ write out Xi - - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so - bx lr - -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/hkdf/hkdf.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/hkdf/hkdf.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/hkdf/hkdf.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/hkdf/hkdf.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/hmac/hmac.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/hmac/hmac.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/hmac/hmac.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/hmac/hmac.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-windows.windows.x86.S deleted file mode 100644 index e6812534..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-windows.windows.x86.S +++ /dev/null @@ -1,701 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _md5_block_asm_data_order -align 16 -_md5_block_asm_data_order: -L$_md5_block_asm_data_order_begin: - push esi - push edi - mov edi,DWORD [12+esp] - mov esi,DWORD [16+esp] - mov ecx,DWORD [20+esp] - push ebp - shl ecx,6 - push ebx - add ecx,esi - sub ecx,64 - mov eax,DWORD [edi] - push ecx - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] -L$000start: - ; - ; R0 section - mov edi,ecx - mov ebp,DWORD [esi] - ; R0 0 - xor edi,edx - and edi,ebx - lea eax,[3614090360+ebp*1+eax] - xor edi,edx - add eax,edi - mov edi,ebx - rol eax,7 - mov ebp,DWORD [4+esi] - add eax,ebx - ; R0 1 - xor edi,ecx - and edi,eax - lea edx,[3905402710+ebp*1+edx] - xor edi,ecx - add edx,edi - mov edi,eax - rol edx,12 - mov ebp,DWORD [8+esi] - add edx,eax - ; R0 2 - xor edi,ebx - and edi,edx - lea ecx,[606105819+ebp*1+ecx] - xor edi,ebx - add ecx,edi - mov edi,edx - rol ecx,17 - mov ebp,DWORD [12+esi] - add ecx,edx - ; R0 3 - xor edi,eax - and edi,ecx - lea ebx,[3250441966+ebp*1+ebx] - xor edi,eax - add ebx,edi - mov edi,ecx - rol ebx,22 - mov ebp,DWORD [16+esi] - add ebx,ecx - ; R0 4 - xor edi,edx - and edi,ebx - lea eax,[4118548399+ebp*1+eax] - xor edi,edx - add eax,edi - mov edi,ebx - rol eax,7 - mov ebp,DWORD [20+esi] - add eax,ebx - ; R0 5 - xor edi,ecx - and edi,eax - lea edx,[1200080426+ebp*1+edx] - xor edi,ecx - add edx,edi - mov edi,eax - rol edx,12 - mov ebp,DWORD [24+esi] - add edx,eax - ; R0 6 - xor edi,ebx - and edi,edx - lea ecx,[2821735955+ebp*1+ecx] - xor edi,ebx - add ecx,edi - mov edi,edx - rol ecx,17 - mov ebp,DWORD [28+esi] - add ecx,edx - ; R0 7 - xor edi,eax - and edi,ecx - lea ebx,[4249261313+ebp*1+ebx] - xor edi,eax - add ebx,edi - mov edi,ecx - rol ebx,22 - mov ebp,DWORD [32+esi] - add ebx,ecx - ; R0 8 - xor edi,edx - and edi,ebx - lea eax,[1770035416+ebp*1+eax] - xor edi,edx - add eax,edi - mov edi,ebx - rol eax,7 - mov ebp,DWORD [36+esi] - add eax,ebx - ; R0 9 - xor edi,ecx - and edi,eax - lea edx,[2336552879+ebp*1+edx] - xor edi,ecx - add edx,edi - mov edi,eax - rol edx,12 - mov ebp,DWORD [40+esi] - add edx,eax - ; R0 10 - xor edi,ebx - and edi,edx - lea ecx,[4294925233+ebp*1+ecx] - xor edi,ebx - add ecx,edi - mov edi,edx - rol ecx,17 - mov ebp,DWORD [44+esi] - add ecx,edx - ; R0 11 - xor edi,eax - and edi,ecx - lea ebx,[2304563134+ebp*1+ebx] - xor edi,eax - add ebx,edi - mov edi,ecx - rol ebx,22 - mov ebp,DWORD [48+esi] - add ebx,ecx - ; R0 12 - xor edi,edx - and edi,ebx - lea eax,[1804603682+ebp*1+eax] - xor edi,edx - add eax,edi - mov edi,ebx - rol eax,7 - mov ebp,DWORD [52+esi] - add eax,ebx - ; R0 13 - xor edi,ecx - and edi,eax - lea edx,[4254626195+ebp*1+edx] - xor edi,ecx - add edx,edi - mov edi,eax - rol edx,12 - mov ebp,DWORD [56+esi] - add edx,eax - ; R0 14 - xor edi,ebx - and edi,edx - lea ecx,[2792965006+ebp*1+ecx] - xor edi,ebx - add ecx,edi - mov edi,edx - rol ecx,17 - mov ebp,DWORD [60+esi] - add ecx,edx - ; R0 15 - xor edi,eax - and edi,ecx - lea ebx,[1236535329+ebp*1+ebx] - xor edi,eax - add ebx,edi - mov edi,ecx - rol ebx,22 - mov ebp,DWORD [4+esi] - add ebx,ecx - ; - ; R1 section - ; R1 16 - lea eax,[4129170786+ebp*1+eax] - xor edi,ebx - and edi,edx - mov ebp,DWORD [24+esi] - xor edi,ecx - add eax,edi - mov edi,ebx - rol eax,5 - add eax,ebx - ; R1 17 
- lea edx,[3225465664+ebp*1+edx] - xor edi,eax - and edi,ecx - mov ebp,DWORD [44+esi] - xor edi,ebx - add edx,edi - mov edi,eax - rol edx,9 - add edx,eax - ; R1 18 - lea ecx,[643717713+ebp*1+ecx] - xor edi,edx - and edi,ebx - mov ebp,DWORD [esi] - xor edi,eax - add ecx,edi - mov edi,edx - rol ecx,14 - add ecx,edx - ; R1 19 - lea ebx,[3921069994+ebp*1+ebx] - xor edi,ecx - and edi,eax - mov ebp,DWORD [20+esi] - xor edi,edx - add ebx,edi - mov edi,ecx - rol ebx,20 - add ebx,ecx - ; R1 20 - lea eax,[3593408605+ebp*1+eax] - xor edi,ebx - and edi,edx - mov ebp,DWORD [40+esi] - xor edi,ecx - add eax,edi - mov edi,ebx - rol eax,5 - add eax,ebx - ; R1 21 - lea edx,[38016083+ebp*1+edx] - xor edi,eax - and edi,ecx - mov ebp,DWORD [60+esi] - xor edi,ebx - add edx,edi - mov edi,eax - rol edx,9 - add edx,eax - ; R1 22 - lea ecx,[3634488961+ebp*1+ecx] - xor edi,edx - and edi,ebx - mov ebp,DWORD [16+esi] - xor edi,eax - add ecx,edi - mov edi,edx - rol ecx,14 - add ecx,edx - ; R1 23 - lea ebx,[3889429448+ebp*1+ebx] - xor edi,ecx - and edi,eax - mov ebp,DWORD [36+esi] - xor edi,edx - add ebx,edi - mov edi,ecx - rol ebx,20 - add ebx,ecx - ; R1 24 - lea eax,[568446438+ebp*1+eax] - xor edi,ebx - and edi,edx - mov ebp,DWORD [56+esi] - xor edi,ecx - add eax,edi - mov edi,ebx - rol eax,5 - add eax,ebx - ; R1 25 - lea edx,[3275163606+ebp*1+edx] - xor edi,eax - and edi,ecx - mov ebp,DWORD [12+esi] - xor edi,ebx - add edx,edi - mov edi,eax - rol edx,9 - add edx,eax - ; R1 26 - lea ecx,[4107603335+ebp*1+ecx] - xor edi,edx - and edi,ebx - mov ebp,DWORD [32+esi] - xor edi,eax - add ecx,edi - mov edi,edx - rol ecx,14 - add ecx,edx - ; R1 27 - lea ebx,[1163531501+ebp*1+ebx] - xor edi,ecx - and edi,eax - mov ebp,DWORD [52+esi] - xor edi,edx - add ebx,edi - mov edi,ecx - rol ebx,20 - add ebx,ecx - ; R1 28 - lea eax,[2850285829+ebp*1+eax] - xor edi,ebx - and edi,edx - mov ebp,DWORD [8+esi] - xor edi,ecx - add eax,edi - mov edi,ebx - rol eax,5 - add eax,ebx - ; R1 29 - lea edx,[4243563512+ebp*1+edx] - xor edi,eax - and edi,ecx - mov ebp,DWORD [28+esi] - xor edi,ebx - add edx,edi - mov edi,eax - rol edx,9 - add edx,eax - ; R1 30 - lea ecx,[1735328473+ebp*1+ecx] - xor edi,edx - and edi,ebx - mov ebp,DWORD [48+esi] - xor edi,eax - add ecx,edi - mov edi,edx - rol ecx,14 - add ecx,edx - ; R1 31 - lea ebx,[2368359562+ebp*1+ebx] - xor edi,ecx - and edi,eax - mov ebp,DWORD [20+esi] - xor edi,edx - add ebx,edi - mov edi,ecx - rol ebx,20 - add ebx,ecx - ; - ; R2 section - ; R2 32 - xor edi,edx - xor edi,ebx - lea eax,[4294588738+ebp*1+eax] - add eax,edi - rol eax,4 - mov ebp,DWORD [32+esi] - mov edi,ebx - ; R2 33 - lea edx,[2272392833+ebp*1+edx] - add eax,ebx - xor edi,ecx - xor edi,eax - mov ebp,DWORD [44+esi] - add edx,edi - mov edi,eax - rol edx,11 - add edx,eax - ; R2 34 - xor edi,ebx - xor edi,edx - lea ecx,[1839030562+ebp*1+ecx] - add ecx,edi - rol ecx,16 - mov ebp,DWORD [56+esi] - mov edi,edx - ; R2 35 - lea ebx,[4259657740+ebp*1+ebx] - add ecx,edx - xor edi,eax - xor edi,ecx - mov ebp,DWORD [4+esi] - add ebx,edi - mov edi,ecx - rol ebx,23 - add ebx,ecx - ; R2 36 - xor edi,edx - xor edi,ebx - lea eax,[2763975236+ebp*1+eax] - add eax,edi - rol eax,4 - mov ebp,DWORD [16+esi] - mov edi,ebx - ; R2 37 - lea edx,[1272893353+ebp*1+edx] - add eax,ebx - xor edi,ecx - xor edi,eax - mov ebp,DWORD [28+esi] - add edx,edi - mov edi,eax - rol edx,11 - add edx,eax - ; R2 38 - xor edi,ebx - xor edi,edx - lea ecx,[4139469664+ebp*1+ecx] - add ecx,edi - rol ecx,16 - mov ebp,DWORD [40+esi] - mov edi,edx - ; R2 39 - lea ebx,[3200236656+ebp*1+ebx] - 
add ecx,edx - xor edi,eax - xor edi,ecx - mov ebp,DWORD [52+esi] - add ebx,edi - mov edi,ecx - rol ebx,23 - add ebx,ecx - ; R2 40 - xor edi,edx - xor edi,ebx - lea eax,[681279174+ebp*1+eax] - add eax,edi - rol eax,4 - mov ebp,DWORD [esi] - mov edi,ebx - ; R2 41 - lea edx,[3936430074+ebp*1+edx] - add eax,ebx - xor edi,ecx - xor edi,eax - mov ebp,DWORD [12+esi] - add edx,edi - mov edi,eax - rol edx,11 - add edx,eax - ; R2 42 - xor edi,ebx - xor edi,edx - lea ecx,[3572445317+ebp*1+ecx] - add ecx,edi - rol ecx,16 - mov ebp,DWORD [24+esi] - mov edi,edx - ; R2 43 - lea ebx,[76029189+ebp*1+ebx] - add ecx,edx - xor edi,eax - xor edi,ecx - mov ebp,DWORD [36+esi] - add ebx,edi - mov edi,ecx - rol ebx,23 - add ebx,ecx - ; R2 44 - xor edi,edx - xor edi,ebx - lea eax,[3654602809+ebp*1+eax] - add eax,edi - rol eax,4 - mov ebp,DWORD [48+esi] - mov edi,ebx - ; R2 45 - lea edx,[3873151461+ebp*1+edx] - add eax,ebx - xor edi,ecx - xor edi,eax - mov ebp,DWORD [60+esi] - add edx,edi - mov edi,eax - rol edx,11 - add edx,eax - ; R2 46 - xor edi,ebx - xor edi,edx - lea ecx,[530742520+ebp*1+ecx] - add ecx,edi - rol ecx,16 - mov ebp,DWORD [8+esi] - mov edi,edx - ; R2 47 - lea ebx,[3299628645+ebp*1+ebx] - add ecx,edx - xor edi,eax - xor edi,ecx - mov ebp,DWORD [esi] - add ebx,edi - mov edi,-1 - rol ebx,23 - add ebx,ecx - ; - ; R3 section - ; R3 48 - xor edi,edx - or edi,ebx - lea eax,[4096336452+ebp*1+eax] - xor edi,ecx - mov ebp,DWORD [28+esi] - add eax,edi - mov edi,-1 - rol eax,6 - xor edi,ecx - add eax,ebx - ; R3 49 - or edi,eax - lea edx,[1126891415+ebp*1+edx] - xor edi,ebx - mov ebp,DWORD [56+esi] - add edx,edi - mov edi,-1 - rol edx,10 - xor edi,ebx - add edx,eax - ; R3 50 - or edi,edx - lea ecx,[2878612391+ebp*1+ecx] - xor edi,eax - mov ebp,DWORD [20+esi] - add ecx,edi - mov edi,-1 - rol ecx,15 - xor edi,eax - add ecx,edx - ; R3 51 - or edi,ecx - lea ebx,[4237533241+ebp*1+ebx] - xor edi,edx - mov ebp,DWORD [48+esi] - add ebx,edi - mov edi,-1 - rol ebx,21 - xor edi,edx - add ebx,ecx - ; R3 52 - or edi,ebx - lea eax,[1700485571+ebp*1+eax] - xor edi,ecx - mov ebp,DWORD [12+esi] - add eax,edi - mov edi,-1 - rol eax,6 - xor edi,ecx - add eax,ebx - ; R3 53 - or edi,eax - lea edx,[2399980690+ebp*1+edx] - xor edi,ebx - mov ebp,DWORD [40+esi] - add edx,edi - mov edi,-1 - rol edx,10 - xor edi,ebx - add edx,eax - ; R3 54 - or edi,edx - lea ecx,[4293915773+ebp*1+ecx] - xor edi,eax - mov ebp,DWORD [4+esi] - add ecx,edi - mov edi,-1 - rol ecx,15 - xor edi,eax - add ecx,edx - ; R3 55 - or edi,ecx - lea ebx,[2240044497+ebp*1+ebx] - xor edi,edx - mov ebp,DWORD [32+esi] - add ebx,edi - mov edi,-1 - rol ebx,21 - xor edi,edx - add ebx,ecx - ; R3 56 - or edi,ebx - lea eax,[1873313359+ebp*1+eax] - xor edi,ecx - mov ebp,DWORD [60+esi] - add eax,edi - mov edi,-1 - rol eax,6 - xor edi,ecx - add eax,ebx - ; R3 57 - or edi,eax - lea edx,[4264355552+ebp*1+edx] - xor edi,ebx - mov ebp,DWORD [24+esi] - add edx,edi - mov edi,-1 - rol edx,10 - xor edi,ebx - add edx,eax - ; R3 58 - or edi,edx - lea ecx,[2734768916+ebp*1+ecx] - xor edi,eax - mov ebp,DWORD [52+esi] - add ecx,edi - mov edi,-1 - rol ecx,15 - xor edi,eax - add ecx,edx - ; R3 59 - or edi,ecx - lea ebx,[1309151649+ebp*1+ebx] - xor edi,edx - mov ebp,DWORD [16+esi] - add ebx,edi - mov edi,-1 - rol ebx,21 - xor edi,edx - add ebx,ecx - ; R3 60 - or edi,ebx - lea eax,[4149444226+ebp*1+eax] - xor edi,ecx - mov ebp,DWORD [44+esi] - add eax,edi - mov edi,-1 - rol eax,6 - xor edi,ecx - add eax,ebx - ; R3 61 - or edi,eax - lea edx,[3174756917+ebp*1+edx] - xor edi,ebx - mov ebp,DWORD [8+esi] 
- add edx,edi - mov edi,-1 - rol edx,10 - xor edi,ebx - add edx,eax - ; R3 62 - or edi,edx - lea ecx,[718787259+ebp*1+ecx] - xor edi,eax - mov ebp,DWORD [36+esi] - add ecx,edi - mov edi,-1 - rol ecx,15 - xor edi,eax - add ecx,edx - ; R3 63 - or edi,ecx - lea ebx,[3951481745+ebp*1+ebx] - xor edi,edx - mov ebp,DWORD [24+esp] - add ebx,edi - add esi,64 - rol ebx,21 - mov edi,DWORD [ebp] - add ebx,ecx - add eax,edi - mov edi,DWORD [4+ebp] - add ebx,edi - mov edi,DWORD [8+ebp] - add ecx,edi - mov edi,DWORD [12+ebp] - add edx,edi - mov DWORD [ebp],eax - mov DWORD [4+ebp],ebx - mov edi,DWORD [esp] - mov DWORD [8+ebp],ecx - mov DWORD [12+ebp],edx - cmp edi,esi - jae NEAR L$000start - pop eax - pop ebx - pop ebp - pop edi - pop esi - ret -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cbc.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cbc.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cbc.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cbc.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cfb.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cfb.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cfb.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/cfb.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ctr.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ctr.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ctr.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ctr.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm_nohw.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm_nohw.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm_nohw.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/gcm_nohw.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ofb.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ofb.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ofb.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/ofb.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/polyval.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/polyval.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/polyval.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/modes/polyval.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/ctrdrbg.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/ctrdrbg.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/ctrdrbg.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/ctrdrbg.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/internal.h index 7be2ad3d..b661d123 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/internal.h +++ 
b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/internal.h @@ -18,92 +18,13 @@ #include #include -#include "../../internal.h" +#include "../../bcm_support.h" #include "../modes/internal.h" #if defined(__cplusplus) extern "C" { #endif - -#if defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) -#define OPENSSL_RAND_DETERMINISTIC -#elif defined(OPENSSL_TRUSTY) -#define OPENSSL_RAND_TRUSTY -#elif defined(OPENSSL_WINDOWS) -#define OPENSSL_RAND_WINDOWS -#elif defined(OPENSSL_LINUX) -#define OPENSSL_RAND_URANDOM -#elif defined(OPENSSL_APPLE) && !defined(OPENSSL_MACOS) -// Unlike macOS, iOS and similar hide away getentropy(). -#define OPENSSL_RAND_IOS -#else -// By default if you are integrating BoringSSL we expect you to -// provide getentropy from the header file. -#define OPENSSL_RAND_GETENTROPY -#endif - -// RAND_bytes_with_additional_data samples from the RNG after mixing 32 bytes -// from |user_additional_data| in. -void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len, - const uint8_t user_additional_data[32]); - -#if defined(BORINGSSL_FIPS) - -// We overread from /dev/urandom or RDRAND by a factor of 10 and XOR to whiten. -#define BORINGSSL_FIPS_OVERREAD 10 - -// CRYPTO_get_seed_entropy writes |out_entropy_len| bytes of entropy, suitable -// for seeding a DRBG, to |out_entropy|. It sets |*out_used_cpu| to one if the -// entropy came directly from the CPU and zero if it came from the OS. It -// actively obtains entropy from the CPU/OS and so should not be called from -// within the FIPS module. -void CRYPTO_get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len, - int *out_used_cpu); - -// RAND_load_entropy supplies |entropy_len| bytes of entropy to the module. The -// |want_additional_input| parameter is true iff the entropy was obtained from -// a source other than the system, e.g. directly from the CPU. -void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len, - int want_additional_input); - -// RAND_need_entropy is implemented outside of the FIPS module and is called -// when the module has stopped because it has run out of entropy. -void RAND_need_entropy(size_t bytes_needed); - -#endif // BORINGSSL_FIPS - -// CRYPTO_sysrand fills |len| bytes at |buf| with entropy from the operating -// system. -void CRYPTO_sysrand(uint8_t *buf, size_t len); - -// CRYPTO_sysrand_for_seed fills |len| bytes at |buf| with entropy from the -// operating system. It may draw from the |GRND_RANDOM| pool on Android, -// depending on the vendor's configuration. -void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len); - -#if defined(OPENSSL_RAND_URANDOM) || defined(OPENSSL_RAND_WINDOWS) -// CRYPTO_init_sysrand initializes long-lived resources needed to draw entropy -// from the operating system. -void CRYPTO_init_sysrand(void); -#else -OPENSSL_INLINE void CRYPTO_init_sysrand(void) {} -#endif // defined(OPENSSL_RAND_URANDOM) || defined(OPENSSL_RAND_WINDOWS) - -#if defined(OPENSSL_RAND_URANDOM) -// CRYPTO_sysrand_if_available fills |len| bytes at |buf| with entropy from the -// operating system, or early /dev/urandom data, and returns 1, _if_ the entropy -// pool is initialized or if getrandom() is not available and not in FIPS mode. -// Otherwise it will not block and will instead fill |buf| with all zeros and -// return 0. 
-int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len); -#else -OPENSSL_INLINE int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { - CRYPTO_sysrand(buf, len); - return 1; -} -#endif // defined(OPENSSL_RAND_URANDOM) - // rand_fork_unsafe_buffering_enabled returns whether fork-unsafe buffering has // been enabled via |RAND_enable_fork_unsafe_buffering|. int rand_fork_unsafe_buffering_enabled(void); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c.inc similarity index 92% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c.inc index 1a8b198f..38ed6857 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/rand.c.inc @@ -12,8 +12,6 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#include - #include #include #include @@ -26,10 +24,10 @@ #include #include -#include "internal.h" -#include "fork_detect.h" -#include "../../internal.h" +#include "../../bcm_support.h" +#include "../bcm_interface.h" #include "../delocate.h" +#include "internal.h" // It's assumed that the operating system always has an unfailing source of @@ -99,7 +97,7 @@ static void rand_thread_state_clear_all(void) { CTR_DRBG_clear(&cur->drbg); } // The locks are deliberately left locked so that any threads that are still - // running will hang if they try to call |RAND_bytes|. It also ensures + // running will hang if they try to call |BCM_rand_bytes|. It also ensures // |rand_thread_state_free| cannot free any thread state while we've taken the // lock. } @@ -164,26 +162,24 @@ static int rdrand(uint8_t *buf, const size_t len) { #else -static int rdrand(uint8_t *buf, size_t len) { - return 0; -} +static int rdrand(uint8_t *buf, size_t len) { return 0; } #endif -#if defined(BORINGSSL_FIPS) - -void CRYPTO_get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len, - int *out_want_additional_input) { - *out_want_additional_input = 0; - if (have_rdrand() && rdrand(out_entropy, out_entropy_len)) { - *out_want_additional_input = 1; - } else { - CRYPTO_sysrand_for_seed(out_entropy, out_entropy_len); +bcm_status BCM_rand_bytes_hwrng(uint8_t *buf, const size_t len) { + if (!have_rdrand()) { + return bcm_status_failure; } + if (rdrand(buf, len)) { + return bcm_status_not_approved; + } + return bcm_status_failure; } +#if defined(BORINGSSL_FIPS) + // In passive entropy mode, entropy is supplied from outside of the module via -// |RAND_load_entropy| and is stored in global instance of the following +// |BCM_rand_load_entropy| and is stored in global instance of the following // structure. 
struct entropy_buffer { @@ -202,8 +198,8 @@ struct entropy_buffer { DEFINE_BSS_GET(struct entropy_buffer, entropy_buffer); DEFINE_STATIC_MUTEX(entropy_buffer_lock); -void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len, - int want_additional_input) { +bcm_infallible BCM_rand_load_entropy(const uint8_t *entropy, size_t entropy_len, + int want_additional_input) { struct entropy_buffer *const buffer = entropy_buffer_bss_get(); CRYPTO_MUTEX_lock_write(entropy_buffer_lock_bss_get()); @@ -214,9 +210,9 @@ void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len, OPENSSL_memcpy(&buffer->bytes[buffer->bytes_valid], entropy, entropy_len); buffer->bytes_valid += entropy_len; - buffer->want_additional_input |= - want_additional_input && (entropy_len != 0); + buffer->want_additional_input |= want_additional_input && (entropy_len != 0); CRYPTO_MUTEX_unlock_write(entropy_buffer_lock_bss_get()); + return bcm_infallible_not_approved; } // get_seed_entropy fills |out_entropy_len| bytes of |out_entropy| from the @@ -330,10 +326,10 @@ static void rand_get_seed(struct rand_thread_state *state, #endif -void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len, - const uint8_t user_additional_data[32]) { +bcm_infallible BCM_rand_bytes_with_additional_data( + uint8_t *out, size_t out_len, const uint8_t user_additional_data[32]) { if (out_len == 0) { - return; + return bcm_infallible_approved; } const uint64_t fork_generation = CRYPTO_get_fork_generation(); @@ -473,21 +469,11 @@ void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len, #if defined(BORINGSSL_FIPS) CRYPTO_MUTEX_unlock_read(&state->clear_drbg_lock); #endif + return bcm_infallible_approved; } -int RAND_bytes(uint8_t *out, size_t out_len) { +bcm_infallible BCM_rand_bytes(uint8_t *out, size_t out_len) { static const uint8_t kZeroAdditionalData[32] = {0}; - RAND_bytes_with_additional_data(out, out_len, kZeroAdditionalData); - return 1; -} - -int RAND_pseudo_bytes(uint8_t *buf, size_t len) { - return RAND_bytes(buf, len); -} - -void RAND_get_system_entropy_for_custom_prng(uint8_t *buf, size_t len) { - if (len > 256) { - abort(); - } - CRYPTO_sysrand_for_seed(buf, len); + BCM_rand_bytes_with_additional_data(out, out_len, kZeroAdditionalData); + return bcm_infallible_approved; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/blinding.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/blinding.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/blinding.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/blinding.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c.inc index 72544134..1ac8b31b 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/padding.c.inc @@ -63,11 +63,11 @@ #include #include #include -#include #include #include "internal.h" #include "../service_indicator/internal.h" +#include "../bcm_interface.h" #include "../../internal.h" @@ -369,9 +369,7 @@ int RSA_padding_add_PKCS1_PSS_mgf1(const RSA *rsa, unsigned char *EM, if (!salt) { goto err; } - if (!RAND_bytes(salt, sLen)) { - goto err; - } + BCM_rand_bytes(salt, sLen); } maskedDBLen = emLen - hLen - 1; H = EM + maskedDBLen; @@ -394,7 +392,6 @@ int RSA_padding_add_PKCS1_PSS_mgf1(const RSA 
*rsa, unsigned char *EM, } p = EM; - // Initial PS XORs with all zeroes which is a NOP so just update // pointer. Note from a test above this value is guaranteed to // be non-negative. diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c.inc similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c.inc index fc2d5b2a..eea8988b 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa.c.inc @@ -480,7 +480,7 @@ static const struct pkcs1_sig_prefix kPKCS1SigPrefixes[] = { }, { NID_sha1, - SHA_DIGEST_LENGTH, + BCM_SHA_DIGEST_LENGTH, 15, {0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14}, @@ -758,7 +758,8 @@ int RSA_verify_pss_mgf1(RSA *rsa, const uint8_t *digest, size_t digest_len, static int check_mod_inverse(int *out_ok, const BIGNUM *a, const BIGNUM *ainv, const BIGNUM *m, unsigned m_min_bits, BN_CTX *ctx) { - if (BN_is_negative(ainv) || BN_cmp(ainv, m) >= 0) { + if (BN_is_negative(ainv) || + constant_time_declassify_int(BN_cmp(ainv, m) >= 0)) { *out_ok = 0; return 1; } @@ -772,7 +773,7 @@ static int check_mod_inverse(int *out_ok, const BIGNUM *a, const BIGNUM *ainv, bn_mul_consttime(tmp, a, ainv, ctx) && bn_div_consttime(NULL, tmp, tmp, m, m_min_bits, ctx); if (ret) { - *out_ok = BN_is_one(tmp); + *out_ok = constant_time_declassify_int(BN_is_one(tmp)); } BN_CTX_end(ctx); return ret; @@ -831,8 +832,10 @@ int RSA_check_key(const RSA *key) { // bounds, to avoid a DoS vector in |bn_mul_consttime| below. Note that // n was bound by |rsa_check_public_key|. This also implicitly checks p and q // are odd, which is a necessary condition for Montgomery reduction. - if (BN_is_negative(key->p) || BN_cmp(key->p, key->n) >= 0 || - BN_is_negative(key->q) || BN_cmp(key->q, key->n) >= 0) { + if (BN_is_negative(key->p) || + constant_time_declassify_int(BN_cmp(key->p, key->n) >= 0) || + BN_is_negative(key->q) || + constant_time_declassify_int(BN_cmp(key->q, key->n) >= 0)) { OPENSSL_PUT_ERROR(RSA, RSA_R_N_NOT_EQUAL_P_Q); goto out; } @@ -863,7 +866,8 @@ int RSA_check_key(const RSA *key) { goto out; } - if (!BN_is_one(&tmp) || !BN_is_one(&de)) { + if (constant_time_declassify_int(!BN_is_one(&tmp)) || + constant_time_declassify_int(!BN_is_one(&de))) { OPENSSL_PUT_ERROR(RSA, RSA_R_D_E_NOT_CONGRUENT_TO_1); goto out; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c.inc similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c.inc index 50dc1f32..c13d979f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsa/rsa_impl.c.inc @@ -65,10 +65,10 @@ #include #include +#include "../../bcm_support.h" #include "../../internal.h" #include "../bn/internal.h" #include "../delocate.h" -#include "../rand/fork_detect.h" #include "../service_indicator/internal.h" #include "internal.h" @@ -795,7 +795,7 @@ static int mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx) { // This is a pre-condition for |mod_montgomery|. It was already checked by the // caller. - assert(BN_ucmp(I, n) < 0); + declassify_assert(BN_ucmp(I, n) < 0); if (// |m1| is the result modulo |q|. 
!mod_montgomery(r1, I, q, rsa->mont_q, p, ctx) || @@ -831,7 +831,7 @@ static int mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx) { // bound the width slightly higher, so fix it. This trips constant-time checks // because a naive data flow analysis does not realize the excess words are // publicly zero. - assert(BN_cmp(r0, n) < 0); + declassify_assert(BN_cmp(r0, n) < 0); bn_assert_fits_in_bytes(r0, BN_num_bytes(n)); if (!bn_resize_words(r0, n->width)) { goto err; @@ -1003,20 +1003,25 @@ static int generate_prime(BIGNUM *out, int bits, const BIGNUM *e, // retrying. That is, we reject a negligible fraction of primes that are // within the FIPS bound, but we will never accept a prime outside the // bound, ensuring the resulting RSA key is the right size. - if (BN_cmp(out, sqrt2) <= 0) { + // + // Values over the threshold are discarded, so it is safe to leak this + // comparison. + if (constant_time_declassify_int(BN_cmp(out, sqrt2) <= 0)) { continue; } // RSA key generation's bottleneck is discarding composites. If it fails // trial division, do not bother computing a GCD or performing Miller-Rabin. if (!bn_odd_number_is_obviously_composite(out)) { - // Check gcd(out-1, e) is one (steps 4.5 and 5.6). + // Check gcd(out-1, e) is one (steps 4.5 and 5.6). Leaking the final + // result of this comparison is safe because, if not relatively prime, the + // value will be discarded. int relatively_prime; - if (!BN_sub(tmp, out, BN_value_one()) || + if (!bn_usub_consttime(tmp, out, BN_value_one()) || !bn_is_relatively_prime(&relatively_prime, tmp, e, ctx)) { goto err; } - if (relatively_prime) { + if (constant_time_declassify_int(relatively_prime)) { // Test |out| for primality (steps 4.5.1 and 5.6.1). int is_probable_prime; if (!BN_primality_test(&is_probable_prime, out, @@ -1174,8 +1179,9 @@ static int rsa_generate_key_impl(RSA *rsa, int bits, const BIGNUM *e_value, } // Retry if |rsa->d| <= 2^|prime_bits|. See appendix B.3.1's guidance on - // values for d. - } while (BN_cmp(rsa->d, pow2_prime_bits) <= 0); + // values for d. When we retry, p and q are discarded, so it is safe to leak + // this comparison. + } while (constant_time_declassify_int(BN_cmp(rsa->d, pow2_prime_bits) <= 0)); assert(BN_num_bits(pm1) == (unsigned)prime_bits); assert(BN_num_bits(qm1) == (unsigned)prime_bits); @@ -1189,6 +1195,9 @@ static int rsa_generate_key_impl(RSA *rsa, int bits, const BIGNUM *e_value, } bn_set_minimal_width(rsa->n); + // |rsa->n| is computed from the private key, but is public. + bn_declassify(rsa->n); + // Sanity-check that |rsa->n| has the specified size. This is implied by // |generate_prime|'s bounds. 
if (BN_num_bits(rsa->n) != (unsigned)bits) { @@ -1241,6 +1250,11 @@ static int RSA_generate_key_ex_maybe_fips(RSA *rsa, int bits, int check_fips) { boringssl_ensure_rsa_self_test(); + if (rsa == NULL) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + RSA *tmp = NULL; uint32_t err; int ret = 0; diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c.inc similarity index 94% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c.inc index b3282863..9bd9e7bd 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/fips.c.inc @@ -72,7 +72,8 @@ int FIPS_query_algorithm_status(const char *algorithm) { #if defined(BORINGSSL_FIPS_COUNTERS) size_t FIPS_read_counter(enum fips_counter_t counter) { - if (counter < 0 || counter > fips_counter_max) { + size_t index = (size_t)counter; + if (index > fips_counter_max) { abort(); } @@ -82,11 +83,12 @@ size_t FIPS_read_counter(enum fips_counter_t counter) { return 0; } - return array[counter]; + return array[index]; } void boringssl_fips_inc_counter(enum fips_counter_t counter) { - if (counter < 0 || counter > fips_counter_max) { + size_t index = (size_t)counter; + if (index > fips_counter_max) { abort(); } @@ -106,7 +108,7 @@ void boringssl_fips_inc_counter(enum fips_counter_t counter) { } } - array[counter]++; + array[index]++; } #else diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c.inc similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c.inc index 26439fc5..8b63d0b8 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/self_check/self_check.c.inc @@ -33,6 +33,7 @@ #include #include "../../internal.h" +#include "../delocate.h" #include "../dh/internal.h" #include "../ec/internal.h" #include "../ecdsa/internal.h" @@ -77,28 +78,6 @@ static int set_bignum(BIGNUM **out, const uint8_t *in, size_t len) { return *out != NULL; } -static int serialize_ecdsa_sig(uint8_t *out, size_t out_len, - const ECDSA_SIG *sig) { - if ((out_len & 1) || // - !BN_bn2bin_padded(out, out_len / 2, sig->r) || - !BN_bn2bin_padded(out + out_len / 2, out_len / 2, sig->s)) { - return 0; - } - return 1; -} - -static ECDSA_SIG *parse_ecdsa_sig(const uint8_t *in, size_t in_len) { - ECDSA_SIG *ret = ECDSA_SIG_new(); - if (!ret || // - (in_len & 1) || - BN_bin2bn(in, in_len/2, ret->r) == NULL || - BN_bin2bn(in + in_len/2, in_len/2, ret->s) == NULL) { - ECDSA_SIG_free(ret); - ret = NULL; - } - return ret; -} - static RSA *self_test_rsa_key(void) { static const uint8_t kN[] = { 0xd3, 0x3a, 0x62, 0x9f, 0x07, 0x77, 0xb0, 0x18, 0xf3, 0xff, 0xfe, 0xcc, @@ -415,7 +394,6 @@ static int boringssl_self_test_ecc(void) { EC_POINT *ec_point_in = NULL; EC_POINT *ec_point_out = NULL; BIGNUM *ec_scalar = NULL; - ECDSA_SIG *sig = NULL; ec_key = self_test_ecdsa_key(); if (ec_key == NULL) { @@ -443,13 +421,12 @@ static int boringssl_self_test_ecc(void) { uint8_t ecdsa_k[32] = {0}; ecdsa_k[31] = 42; - sig = ecdsa_sign_with_nonce_for_known_answer_test( - kECDSASignDigest, sizeof(kECDSASignDigest), ec_key, ecdsa_k, - sizeof(ecdsa_k)); - uint8_t 
ecdsa_sign_output[64]; - if (sig == NULL || - !serialize_ecdsa_sig(ecdsa_sign_output, sizeof(ecdsa_sign_output), sig) || + size_t ecdsa_sign_output_len; + if (!ecdsa_sign_fixed_with_nonce_for_known_answer_test( + kECDSASignDigest, sizeof(kECDSASignDigest), ecdsa_sign_output, + &ecdsa_sign_output_len, sizeof(ecdsa_sign_output), ec_key, ecdsa_k, + sizeof(ecdsa_k)) || !check_test(kECDSASignSig, ecdsa_sign_output, sizeof(ecdsa_sign_output), "ECDSA-sign signature")) { fprintf(stderr, "ECDSA-sign KAT failed.\n"); @@ -470,11 +447,9 @@ static int boringssl_self_test_ecc(void) { 0x8e, 0x5f, 0x64, 0xc3, 0x7e, 0xa2, 0xcf, 0x05, 0x29, }; - ECDSA_SIG_free(sig); - sig = parse_ecdsa_sig(kECDSAVerifySig, sizeof(kECDSAVerifySig)); - if (!sig || - !ecdsa_do_verify_no_self_test(kECDSAVerifyDigest, - sizeof(kECDSAVerifyDigest), sig, ec_key)) { + if (!ecdsa_verify_fixed_no_self_test( + kECDSAVerifyDigest, sizeof(kECDSAVerifyDigest), kECDSAVerifySig, + sizeof(kECDSAVerifySig), ec_key)) { fprintf(stderr, "ECDSA-verify KAT failed.\n"); goto err; } @@ -532,7 +507,6 @@ static int boringssl_self_test_ecc(void) { EC_POINT_free(ec_point_in); EC_POINT_free(ec_point_out); BN_free(ec_scalar); - ECDSA_SIG_free(sig); return ret; } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/internal.h index 5b6de8ad..c82eb332 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/internal.h @@ -28,8 +28,8 @@ void FIPS_service_indicator_update_state(void); // stop |FIPS_service_indicator_update_state| from actually updating the service // indicator. This is used when a primitive calls a potentially approved // primitive to avoid false positives. For example, just because a key -// generation calls |RAND_bytes| (and thus the approved DRBG) doesn't mean that -// the key generation operation itself is approved. +// generation calls |BCM_rand_bytes| (and thus the approved DRBG) doesn't mean +// that the key generation operation itself is approved. // // This lock nests: i.e. locking twice is fine so long as each lock is paired // with an unlock. If the (64-bit) counter overflows, the process aborts. diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c.inc similarity index 93% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c.inc index aff9b54e..71db464f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/service_indicator/service_indicator.c.inc @@ -171,7 +171,6 @@ static int is_md_fips_approved_for_signing(int md_type) { // type is FIPS approved for verifying, and zero otherwise. static int is_md_fips_approved_for_verifying(int md_type) { switch (md_type) { - case NID_sha1: case NID_sha224: case NID_sha256: case NID_sha384: @@ -184,7 +183,6 @@ static int is_md_fips_approved_for_verifying(int md_type) { } static void evp_md_ctx_verify_service_indicator(const EVP_MD_CTX *ctx, - int rsa_1024_ok, int (*md_ok)(int md_type)) { if (EVP_MD_CTX_md(ctx) == NULL) { // Signature schemes without a prehash are currently never FIPS approved. 
@@ -232,8 +230,7 @@ static void evp_md_ctx_verify_service_indicator(const EVP_MD_CTX *ctx, // Check if the MD type and the RSA key size are approved. if (md_ok(md_type) && - ((rsa_1024_ok && pkey_size == 128) || pkey_size == 256 || - pkey_size == 384 || pkey_size == 512)) { + (pkey_size == 256 || pkey_size == 384 || pkey_size == 512)) { FIPS_service_indicator_update_state(); } } else if (pkey_type == EVP_PKEY_EC) { @@ -251,7 +248,7 @@ static void evp_md_ctx_verify_service_indicator(const EVP_MD_CTX *ctx, } void EC_KEY_keygen_verify_service_indicator(const EC_KEY *eckey) { - if (is_ec_fips_approved(EC_GROUP_get_curve_name(eckey->group))) { + if (is_ec_fips_approved(EC_GROUP_get_curve_name(EC_KEY_get0_group(eckey)))) { FIPS_service_indicator_update_state(); } } @@ -280,17 +277,17 @@ void EVP_Cipher_verify_service_indicator(const EVP_CIPHER_CTX *ctx) { } void EVP_DigestVerify_verify_service_indicator(const EVP_MD_CTX *ctx) { - return evp_md_ctx_verify_service_indicator(ctx, /*rsa_1024_ok=*/1, + return evp_md_ctx_verify_service_indicator(ctx, is_md_fips_approved_for_verifying); } void EVP_DigestSign_verify_service_indicator(const EVP_MD_CTX *ctx) { - return evp_md_ctx_verify_service_indicator(ctx, /*rsa_1024_ok=*/0, + return evp_md_ctx_verify_service_indicator(ctx, is_md_fips_approved_for_signing); } void HMAC_verify_service_indicator(const EVP_MD *evp_md) { - switch (evp_md->type) { + switch (EVP_MD_type(evp_md)) { case NID_sha1: case NID_sha224: case NID_sha256: @@ -303,12 +300,9 @@ void HMAC_verify_service_indicator(const EVP_MD *evp_md) { } void TLSKDF_verify_service_indicator(const EVP_MD *md) { - // HMAC-MD5/HMAC-SHA1 (both used concurrently) is approved for use in the KDF - // in TLS 1.0/1.1. HMAC-SHA{256, 384, 512} are approved for use in the KDF in - // TLS 1.2. These Key Derivation functions are to be used in the context of - // the TLS protocol. + // HMAC-SHA{256, 384, 512} are approved for use in the KDF in TLS 1.2. These + // Key Derivation functions are to be used in the context of the TLS protocol. switch (EVP_MD_type(md)) { - case NID_md5_sha1: case NID_sha256: case NID_sha384: case NID_sha512: diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/internal.h b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/internal.h index 6ac869cb..977c4041 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/internal.h @@ -80,6 +80,7 @@ OPENSSL_INLINE int sha512_hw_capable(void) { #define SHA1_ASM_NOHW #define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW #define SHA1_ASM_SSSE3 OPENSSL_INLINE int sha1_ssse3_capable(void) { @@ -127,10 +128,14 @@ OPENSSL_INLINE int sha256_avx_capable(void) { void sha256_block_data_order_avx(uint32_t state[8], const uint8_t *data, size_t num); -// TODO(crbug.com/boringssl/673): Move the remaining CPU dispatch to C. -#define SHA512_ASM -void sha512_block_data_order(uint64_t state[8], const uint8_t *data, - size_t num_blocks); +#define SHA512_ASM_SSSE3 +OPENSSL_INLINE int sha512_ssse3_capable(void) { + // TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not + // say to. 
+ return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable(); +} +void sha512_block_data_order_ssse3(uint64_t state[8], const uint8_t *data, + size_t num); #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c.inc similarity index 96% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c.inc index 14d9390f..5f14c166 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha1.c.inc @@ -54,35 +54,25 @@ * copied and put under another distribution licence * [including the GNU Public Licence.] */ -#include - #include #include +#include "../bcm_interface.h" #include "../../internal.h" #include "../digest/md32_common.h" #include "../service_indicator/internal.h" #include "internal.h" -int SHA1_Init(SHA_CTX *sha) { +bcm_infallible BCM_sha1_init(SHA_CTX *sha) { OPENSSL_memset(sha, 0, sizeof(SHA_CTX)); sha->h[0] = 0x67452301UL; sha->h[1] = 0xefcdab89UL; sha->h[2] = 0x98badcfeUL; sha->h[3] = 0x10325476UL; sha->h[4] = 0xc3d2e1f0UL; - return 1; -} - -uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) { - SHA_CTX ctx; - SHA1_Init(&ctx); - SHA1_Update(&ctx, data, len); - SHA1_Final(out, &ctx); - OPENSSL_cleanse(&ctx, sizeof(ctx)); - return out; + return bcm_infallible_approved; } #if !defined(SHA1_ASM) @@ -90,14 +80,15 @@ static void sha1_block_data_order(uint32_t state[5], const uint8_t *data, size_t num); #endif -void SHA1_Transform(SHA_CTX *c, const uint8_t data[SHA_CBLOCK]) { +bcm_infallible BCM_sha1_transform(SHA_CTX *c, const uint8_t data[SHA_CBLOCK]) { sha1_block_data_order(c->h, data, 1); + return bcm_infallible_approved; } -int SHA1_Update(SHA_CTX *c, const void *data, size_t len) { +bcm_infallible BCM_sha1_update(SHA_CTX *c, const void *data, size_t len) { crypto_md32_update(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num, &c->Nh, &c->Nl, data, len); - return 1; + return bcm_infallible_approved; } static void sha1_output_state(uint8_t out[SHA_DIGEST_LENGTH], @@ -109,16 +100,16 @@ static void sha1_output_state(uint8_t out[SHA_DIGEST_LENGTH], CRYPTO_store_u32_be(out + 16, ctx->h[4]); } -int SHA1_Final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *c) { +bcm_infallible BCM_sha1_final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *c) { crypto_md32_final(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num, c->Nh, c->Nl, /*is_big_endian=*/1); sha1_output_state(out, c); FIPS_service_indicator_update_state(); - return 1; + return bcm_infallible_approved; } -void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, +bcm_infallible BCM_fips_186_2_prf(uint8_t *out, size_t out_len, const uint8_t xkey[SHA_DIGEST_LENGTH]) { // XKEY and XVAL are 160-bit values, but are internally right-padded up to // block size. See FIPS 186-2, Appendix 3.3. This buffer maintains both the @@ -130,8 +121,8 @@ void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, // We always use a zero XSEED, so we can merge the inner and outer loops. // XVAL is also always equal to XKEY. 
SHA_CTX ctx; - SHA1_Init(&ctx); - SHA1_Transform(&ctx, block); + BCM_sha1_init(&ctx); + BCM_sha1_transform(&ctx, block); // XKEY = (1 + XKEY + w_i) mod 2^b uint32_t carry = 1; @@ -152,6 +143,7 @@ void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, out += SHA_DIGEST_LENGTH; out_len -= SHA_DIGEST_LENGTH; } + return bcm_infallible_not_approved; } #define Xupdate(a, ix, ia, ib, ic, id) \ diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha256.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha256.c.inc similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha256.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha256.c.inc diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c.inc similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c.inc index 8135457a..ea73199d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha/sha512.c.inc @@ -516,6 +516,12 @@ static void sha512_block_data_order(uint64_t state[8], const uint8_t *data, return; } #endif +#if defined(SHA512_ASM_SSSE3) + if (sha512_ssse3_capable()) { + sha512_block_data_order_ssse3(state, data, num); + return; + } +#endif #if defined(SHA512_ASM_NEON) if (CRYPTO_is_NEON_capable()) { sha512_block_data_order_neon(state, data, num); diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-windows.windows.x86.S deleted file mode 100644 index 3622e531..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-windows.windows.x86.S +++ /dev/null @@ -1,3797 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _sha1_block_data_order_nohw -align 16 -_sha1_block_data_order_nohw: -L$_sha1_block_data_order_nohw_begin: - push ebp - push ebx - push esi - push edi - mov ebp,DWORD [20+esp] - mov esi,DWORD [24+esp] - mov eax,DWORD [28+esp] - sub esp,76 - shl eax,6 - add eax,esi - mov DWORD [104+esp],eax - mov edi,DWORD [16+ebp] - jmp NEAR L$000loop -align 16 -L$000loop: - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edx,DWORD [12+esi] - bswap eax - bswap ebx - bswap ecx - bswap edx - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],edx - mov eax,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov edx,DWORD [28+esi] - bswap eax - bswap ebx - bswap ecx - bswap edx - mov DWORD [16+esp],eax - mov DWORD [20+esp],ebx - mov DWORD [24+esp],ecx - mov DWORD [28+esp],edx - mov eax,DWORD [32+esi] - mov ebx,DWORD [36+esi] - mov ecx,DWORD [40+esi] - mov edx,DWORD [44+esi] - bswap eax - bswap ebx - bswap ecx - bswap edx - mov DWORD [32+esp],eax - mov DWORD [36+esp],ebx - mov DWORD [40+esp],ecx - mov DWORD [44+esp],edx - mov eax,DWORD [48+esi] - mov ebx,DWORD [52+esi] - mov ecx,DWORD [56+esi] - mov edx,DWORD [60+esi] - bswap eax - bswap ebx - bswap ecx - bswap edx - mov DWORD [48+esp],eax - mov DWORD [52+esp],ebx - mov DWORD [56+esp],ecx - mov DWORD [60+esp],edx - mov DWORD [100+esp],esi - mov eax,DWORD [ebp] - mov ebx,DWORD [4+ebp] - mov ecx,DWORD [8+ebp] - mov edx,DWORD [12+ebp] - ; 00_15 0 - mov esi,ecx - mov ebp,eax - rol ebp,5 - xor esi,edx - add ebp,edi - mov edi,DWORD [esp] - and esi,ebx - ror ebx,2 - xor esi,edx - lea ebp,[1518500249+edi*1+ebp] - add ebp,esi - ; 00_15 1 - mov edi,ebx - mov esi,ebp - rol ebp,5 - xor edi,ecx - add ebp,edx - mov edx,DWORD [4+esp] - and edi,eax - ror eax,2 - xor edi,ecx - lea ebp,[1518500249+edx*1+ebp] - add ebp,edi - ; 00_15 2 - mov edx,eax - mov edi,ebp - rol ebp,5 - xor edx,ebx - add ebp,ecx - mov ecx,DWORD [8+esp] - and edx,esi - ror esi,2 - xor edx,ebx - lea ebp,[1518500249+ecx*1+ebp] - add ebp,edx - ; 00_15 3 - mov ecx,esi - mov edx,ebp - rol ebp,5 - xor ecx,eax - add ebp,ebx - mov ebx,DWORD [12+esp] - and ecx,edi - ror edi,2 - xor ecx,eax - lea ebp,[1518500249+ebx*1+ebp] - add ebp,ecx - ; 00_15 4 - mov ebx,edi - mov ecx,ebp - rol ebp,5 - xor ebx,esi - add ebp,eax - mov eax,DWORD [16+esp] - and ebx,edx - ror edx,2 - xor ebx,esi - lea ebp,[1518500249+eax*1+ebp] - add ebp,ebx - ; 00_15 5 - mov eax,edx - mov ebx,ebp - rol ebp,5 - xor eax,edi - add ebp,esi - mov esi,DWORD [20+esp] - and eax,ecx - ror ecx,2 - xor eax,edi - lea ebp,[1518500249+esi*1+ebp] - add ebp,eax - ; 00_15 6 - mov esi,ecx - mov eax,ebp - rol ebp,5 - xor esi,edx - add ebp,edi - mov edi,DWORD [24+esp] - and esi,ebx - ror ebx,2 - xor esi,edx - lea ebp,[1518500249+edi*1+ebp] - add ebp,esi - ; 00_15 7 - mov edi,ebx - mov esi,ebp - rol ebp,5 - xor edi,ecx - add ebp,edx - mov edx,DWORD [28+esp] - and edi,eax - ror eax,2 - xor edi,ecx - lea ebp,[1518500249+edx*1+ebp] - add ebp,edi - ; 00_15 8 - mov edx,eax - mov edi,ebp - rol ebp,5 - xor edx,ebx - add ebp,ecx - mov ecx,DWORD [32+esp] - and edx,esi - ror esi,2 - xor edx,ebx - lea ebp,[1518500249+ecx*1+ebp] - add ebp,edx - ; 00_15 9 - mov ecx,esi - mov edx,ebp - rol ebp,5 - xor 
ecx,eax - add ebp,ebx - mov ebx,DWORD [36+esp] - and ecx,edi - ror edi,2 - xor ecx,eax - lea ebp,[1518500249+ebx*1+ebp] - add ebp,ecx - ; 00_15 10 - mov ebx,edi - mov ecx,ebp - rol ebp,5 - xor ebx,esi - add ebp,eax - mov eax,DWORD [40+esp] - and ebx,edx - ror edx,2 - xor ebx,esi - lea ebp,[1518500249+eax*1+ebp] - add ebp,ebx - ; 00_15 11 - mov eax,edx - mov ebx,ebp - rol ebp,5 - xor eax,edi - add ebp,esi - mov esi,DWORD [44+esp] - and eax,ecx - ror ecx,2 - xor eax,edi - lea ebp,[1518500249+esi*1+ebp] - add ebp,eax - ; 00_15 12 - mov esi,ecx - mov eax,ebp - rol ebp,5 - xor esi,edx - add ebp,edi - mov edi,DWORD [48+esp] - and esi,ebx - ror ebx,2 - xor esi,edx - lea ebp,[1518500249+edi*1+ebp] - add ebp,esi - ; 00_15 13 - mov edi,ebx - mov esi,ebp - rol ebp,5 - xor edi,ecx - add ebp,edx - mov edx,DWORD [52+esp] - and edi,eax - ror eax,2 - xor edi,ecx - lea ebp,[1518500249+edx*1+ebp] - add ebp,edi - ; 00_15 14 - mov edx,eax - mov edi,ebp - rol ebp,5 - xor edx,ebx - add ebp,ecx - mov ecx,DWORD [56+esp] - and edx,esi - ror esi,2 - xor edx,ebx - lea ebp,[1518500249+ecx*1+ebp] - add ebp,edx - ; 00_15 15 - mov ecx,esi - mov edx,ebp - rol ebp,5 - xor ecx,eax - add ebp,ebx - mov ebx,DWORD [60+esp] - and ecx,edi - ror edi,2 - xor ecx,eax - lea ebp,[1518500249+ebx*1+ebp] - mov ebx,DWORD [esp] - add ecx,ebp - ; 16_19 16 - mov ebp,edi - xor ebx,DWORD [8+esp] - xor ebp,esi - xor ebx,DWORD [32+esp] - and ebp,edx - xor ebx,DWORD [52+esp] - rol ebx,1 - xor ebp,esi - add eax,ebp - mov ebp,ecx - ror edx,2 - mov DWORD [esp],ebx - rol ebp,5 - lea ebx,[1518500249+eax*1+ebx] - mov eax,DWORD [4+esp] - add ebx,ebp - ; 16_19 17 - mov ebp,edx - xor eax,DWORD [12+esp] - xor ebp,edi - xor eax,DWORD [36+esp] - and ebp,ecx - xor eax,DWORD [56+esp] - rol eax,1 - xor ebp,edi - add esi,ebp - mov ebp,ebx - ror ecx,2 - mov DWORD [4+esp],eax - rol ebp,5 - lea eax,[1518500249+esi*1+eax] - mov esi,DWORD [8+esp] - add eax,ebp - ; 16_19 18 - mov ebp,ecx - xor esi,DWORD [16+esp] - xor ebp,edx - xor esi,DWORD [40+esp] - and ebp,ebx - xor esi,DWORD [60+esp] - rol esi,1 - xor ebp,edx - add edi,ebp - mov ebp,eax - ror ebx,2 - mov DWORD [8+esp],esi - rol ebp,5 - lea esi,[1518500249+edi*1+esi] - mov edi,DWORD [12+esp] - add esi,ebp - ; 16_19 19 - mov ebp,ebx - xor edi,DWORD [20+esp] - xor ebp,ecx - xor edi,DWORD [44+esp] - and ebp,eax - xor edi,DWORD [esp] - rol edi,1 - xor ebp,ecx - add edx,ebp - mov ebp,esi - ror eax,2 - mov DWORD [12+esp],edi - rol ebp,5 - lea edi,[1518500249+edx*1+edi] - mov edx,DWORD [16+esp] - add edi,ebp - ; 20_39 20 - mov ebp,esi - xor edx,DWORD [24+esp] - xor ebp,eax - xor edx,DWORD [48+esp] - xor ebp,ebx - xor edx,DWORD [4+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [16+esp],edx - lea edx,[1859775393+ecx*1+edx] - mov ecx,DWORD [20+esp] - add edx,ebp - ; 20_39 21 - mov ebp,edi - xor ecx,DWORD [28+esp] - xor ebp,esi - xor ecx,DWORD [52+esp] - xor ebp,eax - xor ecx,DWORD [8+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [20+esp],ecx - lea ecx,[1859775393+ebx*1+ecx] - mov ebx,DWORD [24+esp] - add ecx,ebp - ; 20_39 22 - mov ebp,edx - xor ebx,DWORD [32+esp] - xor ebp,edi - xor ebx,DWORD [56+esp] - xor ebp,esi - xor ebx,DWORD [12+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [24+esp],ebx - lea ebx,[1859775393+eax*1+ebx] - mov eax,DWORD [28+esp] - add ebx,ebp - ; 20_39 23 - mov ebp,ecx - xor eax,DWORD [36+esp] - xor ebp,edx - xor eax,DWORD [60+esp] - xor ebp,edi - xor eax,DWORD [16+esp] - rol eax,1 - add esi,ebp - 
ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [28+esp],eax - lea eax,[1859775393+esi*1+eax] - mov esi,DWORD [32+esp] - add eax,ebp - ; 20_39 24 - mov ebp,ebx - xor esi,DWORD [40+esp] - xor ebp,ecx - xor esi,DWORD [esp] - xor ebp,edx - xor esi,DWORD [20+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [32+esp],esi - lea esi,[1859775393+edi*1+esi] - mov edi,DWORD [36+esp] - add esi,ebp - ; 20_39 25 - mov ebp,eax - xor edi,DWORD [44+esp] - xor ebp,ebx - xor edi,DWORD [4+esp] - xor ebp,ecx - xor edi,DWORD [24+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [36+esp],edi - lea edi,[1859775393+edx*1+edi] - mov edx,DWORD [40+esp] - add edi,ebp - ; 20_39 26 - mov ebp,esi - xor edx,DWORD [48+esp] - xor ebp,eax - xor edx,DWORD [8+esp] - xor ebp,ebx - xor edx,DWORD [28+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [40+esp],edx - lea edx,[1859775393+ecx*1+edx] - mov ecx,DWORD [44+esp] - add edx,ebp - ; 20_39 27 - mov ebp,edi - xor ecx,DWORD [52+esp] - xor ebp,esi - xor ecx,DWORD [12+esp] - xor ebp,eax - xor ecx,DWORD [32+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [44+esp],ecx - lea ecx,[1859775393+ebx*1+ecx] - mov ebx,DWORD [48+esp] - add ecx,ebp - ; 20_39 28 - mov ebp,edx - xor ebx,DWORD [56+esp] - xor ebp,edi - xor ebx,DWORD [16+esp] - xor ebp,esi - xor ebx,DWORD [36+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [48+esp],ebx - lea ebx,[1859775393+eax*1+ebx] - mov eax,DWORD [52+esp] - add ebx,ebp - ; 20_39 29 - mov ebp,ecx - xor eax,DWORD [60+esp] - xor ebp,edx - xor eax,DWORD [20+esp] - xor ebp,edi - xor eax,DWORD [40+esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [52+esp],eax - lea eax,[1859775393+esi*1+eax] - mov esi,DWORD [56+esp] - add eax,ebp - ; 20_39 30 - mov ebp,ebx - xor esi,DWORD [esp] - xor ebp,ecx - xor esi,DWORD [24+esp] - xor ebp,edx - xor esi,DWORD [44+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [56+esp],esi - lea esi,[1859775393+edi*1+esi] - mov edi,DWORD [60+esp] - add esi,ebp - ; 20_39 31 - mov ebp,eax - xor edi,DWORD [4+esp] - xor ebp,ebx - xor edi,DWORD [28+esp] - xor ebp,ecx - xor edi,DWORD [48+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [60+esp],edi - lea edi,[1859775393+edx*1+edi] - mov edx,DWORD [esp] - add edi,ebp - ; 20_39 32 - mov ebp,esi - xor edx,DWORD [8+esp] - xor ebp,eax - xor edx,DWORD [32+esp] - xor ebp,ebx - xor edx,DWORD [52+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [esp],edx - lea edx,[1859775393+ecx*1+edx] - mov ecx,DWORD [4+esp] - add edx,ebp - ; 20_39 33 - mov ebp,edi - xor ecx,DWORD [12+esp] - xor ebp,esi - xor ecx,DWORD [36+esp] - xor ebp,eax - xor ecx,DWORD [56+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [4+esp],ecx - lea ecx,[1859775393+ebx*1+ecx] - mov ebx,DWORD [8+esp] - add ecx,ebp - ; 20_39 34 - mov ebp,edx - xor ebx,DWORD [16+esp] - xor ebp,edi - xor ebx,DWORD [40+esp] - xor ebp,esi - xor ebx,DWORD [60+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [8+esp],ebx - lea ebx,[1859775393+eax*1+ebx] - mov eax,DWORD [12+esp] - add ebx,ebp - ; 20_39 35 - mov ebp,ecx - xor eax,DWORD [20+esp] - xor ebp,edx - xor eax,DWORD [44+esp] - xor ebp,edi - xor eax,DWORD [esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [12+esp],eax - lea 
eax,[1859775393+esi*1+eax] - mov esi,DWORD [16+esp] - add eax,ebp - ; 20_39 36 - mov ebp,ebx - xor esi,DWORD [24+esp] - xor ebp,ecx - xor esi,DWORD [48+esp] - xor ebp,edx - xor esi,DWORD [4+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [16+esp],esi - lea esi,[1859775393+edi*1+esi] - mov edi,DWORD [20+esp] - add esi,ebp - ; 20_39 37 - mov ebp,eax - xor edi,DWORD [28+esp] - xor ebp,ebx - xor edi,DWORD [52+esp] - xor ebp,ecx - xor edi,DWORD [8+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [20+esp],edi - lea edi,[1859775393+edx*1+edi] - mov edx,DWORD [24+esp] - add edi,ebp - ; 20_39 38 - mov ebp,esi - xor edx,DWORD [32+esp] - xor ebp,eax - xor edx,DWORD [56+esp] - xor ebp,ebx - xor edx,DWORD [12+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [24+esp],edx - lea edx,[1859775393+ecx*1+edx] - mov ecx,DWORD [28+esp] - add edx,ebp - ; 20_39 39 - mov ebp,edi - xor ecx,DWORD [36+esp] - xor ebp,esi - xor ecx,DWORD [60+esp] - xor ebp,eax - xor ecx,DWORD [16+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [28+esp],ecx - lea ecx,[1859775393+ebx*1+ecx] - mov ebx,DWORD [32+esp] - add ecx,ebp - ; 40_59 40 - mov ebp,edi - xor ebx,DWORD [40+esp] - xor ebp,esi - xor ebx,DWORD [esp] - and ebp,edx - xor ebx,DWORD [20+esp] - rol ebx,1 - add ebp,eax - ror edx,2 - mov eax,ecx - rol eax,5 - mov DWORD [32+esp],ebx - lea ebx,[2400959708+ebp*1+ebx] - mov ebp,edi - add ebx,eax - and ebp,esi - mov eax,DWORD [36+esp] - add ebx,ebp - ; 40_59 41 - mov ebp,edx - xor eax,DWORD [44+esp] - xor ebp,edi - xor eax,DWORD [4+esp] - and ebp,ecx - xor eax,DWORD [24+esp] - rol eax,1 - add ebp,esi - ror ecx,2 - mov esi,ebx - rol esi,5 - mov DWORD [36+esp],eax - lea eax,[2400959708+ebp*1+eax] - mov ebp,edx - add eax,esi - and ebp,edi - mov esi,DWORD [40+esp] - add eax,ebp - ; 40_59 42 - mov ebp,ecx - xor esi,DWORD [48+esp] - xor ebp,edx - xor esi,DWORD [8+esp] - and ebp,ebx - xor esi,DWORD [28+esp] - rol esi,1 - add ebp,edi - ror ebx,2 - mov edi,eax - rol edi,5 - mov DWORD [40+esp],esi - lea esi,[2400959708+ebp*1+esi] - mov ebp,ecx - add esi,edi - and ebp,edx - mov edi,DWORD [44+esp] - add esi,ebp - ; 40_59 43 - mov ebp,ebx - xor edi,DWORD [52+esp] - xor ebp,ecx - xor edi,DWORD [12+esp] - and ebp,eax - xor edi,DWORD [32+esp] - rol edi,1 - add ebp,edx - ror eax,2 - mov edx,esi - rol edx,5 - mov DWORD [44+esp],edi - lea edi,[2400959708+ebp*1+edi] - mov ebp,ebx - add edi,edx - and ebp,ecx - mov edx,DWORD [48+esp] - add edi,ebp - ; 40_59 44 - mov ebp,eax - xor edx,DWORD [56+esp] - xor ebp,ebx - xor edx,DWORD [16+esp] - and ebp,esi - xor edx,DWORD [36+esp] - rol edx,1 - add ebp,ecx - ror esi,2 - mov ecx,edi - rol ecx,5 - mov DWORD [48+esp],edx - lea edx,[2400959708+ebp*1+edx] - mov ebp,eax - add edx,ecx - and ebp,ebx - mov ecx,DWORD [52+esp] - add edx,ebp - ; 40_59 45 - mov ebp,esi - xor ecx,DWORD [60+esp] - xor ebp,eax - xor ecx,DWORD [20+esp] - and ebp,edi - xor ecx,DWORD [40+esp] - rol ecx,1 - add ebp,ebx - ror edi,2 - mov ebx,edx - rol ebx,5 - mov DWORD [52+esp],ecx - lea ecx,[2400959708+ebp*1+ecx] - mov ebp,esi - add ecx,ebx - and ebp,eax - mov ebx,DWORD [56+esp] - add ecx,ebp - ; 40_59 46 - mov ebp,edi - xor ebx,DWORD [esp] - xor ebp,esi - xor ebx,DWORD [24+esp] - and ebp,edx - xor ebx,DWORD [44+esp] - rol ebx,1 - add ebp,eax - ror edx,2 - mov eax,ecx - rol eax,5 - mov DWORD [56+esp],ebx - lea ebx,[2400959708+ebp*1+ebx] - mov ebp,edi - add ebx,eax - and ebp,esi - mov eax,DWORD [60+esp] - add ebx,ebp - ; 
40_59 47 - mov ebp,edx - xor eax,DWORD [4+esp] - xor ebp,edi - xor eax,DWORD [28+esp] - and ebp,ecx - xor eax,DWORD [48+esp] - rol eax,1 - add ebp,esi - ror ecx,2 - mov esi,ebx - rol esi,5 - mov DWORD [60+esp],eax - lea eax,[2400959708+ebp*1+eax] - mov ebp,edx - add eax,esi - and ebp,edi - mov esi,DWORD [esp] - add eax,ebp - ; 40_59 48 - mov ebp,ecx - xor esi,DWORD [8+esp] - xor ebp,edx - xor esi,DWORD [32+esp] - and ebp,ebx - xor esi,DWORD [52+esp] - rol esi,1 - add ebp,edi - ror ebx,2 - mov edi,eax - rol edi,5 - mov DWORD [esp],esi - lea esi,[2400959708+ebp*1+esi] - mov ebp,ecx - add esi,edi - and ebp,edx - mov edi,DWORD [4+esp] - add esi,ebp - ; 40_59 49 - mov ebp,ebx - xor edi,DWORD [12+esp] - xor ebp,ecx - xor edi,DWORD [36+esp] - and ebp,eax - xor edi,DWORD [56+esp] - rol edi,1 - add ebp,edx - ror eax,2 - mov edx,esi - rol edx,5 - mov DWORD [4+esp],edi - lea edi,[2400959708+ebp*1+edi] - mov ebp,ebx - add edi,edx - and ebp,ecx - mov edx,DWORD [8+esp] - add edi,ebp - ; 40_59 50 - mov ebp,eax - xor edx,DWORD [16+esp] - xor ebp,ebx - xor edx,DWORD [40+esp] - and ebp,esi - xor edx,DWORD [60+esp] - rol edx,1 - add ebp,ecx - ror esi,2 - mov ecx,edi - rol ecx,5 - mov DWORD [8+esp],edx - lea edx,[2400959708+ebp*1+edx] - mov ebp,eax - add edx,ecx - and ebp,ebx - mov ecx,DWORD [12+esp] - add edx,ebp - ; 40_59 51 - mov ebp,esi - xor ecx,DWORD [20+esp] - xor ebp,eax - xor ecx,DWORD [44+esp] - and ebp,edi - xor ecx,DWORD [esp] - rol ecx,1 - add ebp,ebx - ror edi,2 - mov ebx,edx - rol ebx,5 - mov DWORD [12+esp],ecx - lea ecx,[2400959708+ebp*1+ecx] - mov ebp,esi - add ecx,ebx - and ebp,eax - mov ebx,DWORD [16+esp] - add ecx,ebp - ; 40_59 52 - mov ebp,edi - xor ebx,DWORD [24+esp] - xor ebp,esi - xor ebx,DWORD [48+esp] - and ebp,edx - xor ebx,DWORD [4+esp] - rol ebx,1 - add ebp,eax - ror edx,2 - mov eax,ecx - rol eax,5 - mov DWORD [16+esp],ebx - lea ebx,[2400959708+ebp*1+ebx] - mov ebp,edi - add ebx,eax - and ebp,esi - mov eax,DWORD [20+esp] - add ebx,ebp - ; 40_59 53 - mov ebp,edx - xor eax,DWORD [28+esp] - xor ebp,edi - xor eax,DWORD [52+esp] - and ebp,ecx - xor eax,DWORD [8+esp] - rol eax,1 - add ebp,esi - ror ecx,2 - mov esi,ebx - rol esi,5 - mov DWORD [20+esp],eax - lea eax,[2400959708+ebp*1+eax] - mov ebp,edx - add eax,esi - and ebp,edi - mov esi,DWORD [24+esp] - add eax,ebp - ; 40_59 54 - mov ebp,ecx - xor esi,DWORD [32+esp] - xor ebp,edx - xor esi,DWORD [56+esp] - and ebp,ebx - xor esi,DWORD [12+esp] - rol esi,1 - add ebp,edi - ror ebx,2 - mov edi,eax - rol edi,5 - mov DWORD [24+esp],esi - lea esi,[2400959708+ebp*1+esi] - mov ebp,ecx - add esi,edi - and ebp,edx - mov edi,DWORD [28+esp] - add esi,ebp - ; 40_59 55 - mov ebp,ebx - xor edi,DWORD [36+esp] - xor ebp,ecx - xor edi,DWORD [60+esp] - and ebp,eax - xor edi,DWORD [16+esp] - rol edi,1 - add ebp,edx - ror eax,2 - mov edx,esi - rol edx,5 - mov DWORD [28+esp],edi - lea edi,[2400959708+ebp*1+edi] - mov ebp,ebx - add edi,edx - and ebp,ecx - mov edx,DWORD [32+esp] - add edi,ebp - ; 40_59 56 - mov ebp,eax - xor edx,DWORD [40+esp] - xor ebp,ebx - xor edx,DWORD [esp] - and ebp,esi - xor edx,DWORD [20+esp] - rol edx,1 - add ebp,ecx - ror esi,2 - mov ecx,edi - rol ecx,5 - mov DWORD [32+esp],edx - lea edx,[2400959708+ebp*1+edx] - mov ebp,eax - add edx,ecx - and ebp,ebx - mov ecx,DWORD [36+esp] - add edx,ebp - ; 40_59 57 - mov ebp,esi - xor ecx,DWORD [44+esp] - xor ebp,eax - xor ecx,DWORD [4+esp] - and ebp,edi - xor ecx,DWORD [24+esp] - rol ecx,1 - add ebp,ebx - ror edi,2 - mov ebx,edx - rol ebx,5 - mov DWORD [36+esp],ecx - lea 
ecx,[2400959708+ebp*1+ecx] - mov ebp,esi - add ecx,ebx - and ebp,eax - mov ebx,DWORD [40+esp] - add ecx,ebp - ; 40_59 58 - mov ebp,edi - xor ebx,DWORD [48+esp] - xor ebp,esi - xor ebx,DWORD [8+esp] - and ebp,edx - xor ebx,DWORD [28+esp] - rol ebx,1 - add ebp,eax - ror edx,2 - mov eax,ecx - rol eax,5 - mov DWORD [40+esp],ebx - lea ebx,[2400959708+ebp*1+ebx] - mov ebp,edi - add ebx,eax - and ebp,esi - mov eax,DWORD [44+esp] - add ebx,ebp - ; 40_59 59 - mov ebp,edx - xor eax,DWORD [52+esp] - xor ebp,edi - xor eax,DWORD [12+esp] - and ebp,ecx - xor eax,DWORD [32+esp] - rol eax,1 - add ebp,esi - ror ecx,2 - mov esi,ebx - rol esi,5 - mov DWORD [44+esp],eax - lea eax,[2400959708+ebp*1+eax] - mov ebp,edx - add eax,esi - and ebp,edi - mov esi,DWORD [48+esp] - add eax,ebp - ; 20_39 60 - mov ebp,ebx - xor esi,DWORD [56+esp] - xor ebp,ecx - xor esi,DWORD [16+esp] - xor ebp,edx - xor esi,DWORD [36+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [48+esp],esi - lea esi,[3395469782+edi*1+esi] - mov edi,DWORD [52+esp] - add esi,ebp - ; 20_39 61 - mov ebp,eax - xor edi,DWORD [60+esp] - xor ebp,ebx - xor edi,DWORD [20+esp] - xor ebp,ecx - xor edi,DWORD [40+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [52+esp],edi - lea edi,[3395469782+edx*1+edi] - mov edx,DWORD [56+esp] - add edi,ebp - ; 20_39 62 - mov ebp,esi - xor edx,DWORD [esp] - xor ebp,eax - xor edx,DWORD [24+esp] - xor ebp,ebx - xor edx,DWORD [44+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [56+esp],edx - lea edx,[3395469782+ecx*1+edx] - mov ecx,DWORD [60+esp] - add edx,ebp - ; 20_39 63 - mov ebp,edi - xor ecx,DWORD [4+esp] - xor ebp,esi - xor ecx,DWORD [28+esp] - xor ebp,eax - xor ecx,DWORD [48+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [60+esp],ecx - lea ecx,[3395469782+ebx*1+ecx] - mov ebx,DWORD [esp] - add ecx,ebp - ; 20_39 64 - mov ebp,edx - xor ebx,DWORD [8+esp] - xor ebp,edi - xor ebx,DWORD [32+esp] - xor ebp,esi - xor ebx,DWORD [52+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [esp],ebx - lea ebx,[3395469782+eax*1+ebx] - mov eax,DWORD [4+esp] - add ebx,ebp - ; 20_39 65 - mov ebp,ecx - xor eax,DWORD [12+esp] - xor ebp,edx - xor eax,DWORD [36+esp] - xor ebp,edi - xor eax,DWORD [56+esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [4+esp],eax - lea eax,[3395469782+esi*1+eax] - mov esi,DWORD [8+esp] - add eax,ebp - ; 20_39 66 - mov ebp,ebx - xor esi,DWORD [16+esp] - xor ebp,ecx - xor esi,DWORD [40+esp] - xor ebp,edx - xor esi,DWORD [60+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [8+esp],esi - lea esi,[3395469782+edi*1+esi] - mov edi,DWORD [12+esp] - add esi,ebp - ; 20_39 67 - mov ebp,eax - xor edi,DWORD [20+esp] - xor ebp,ebx - xor edi,DWORD [44+esp] - xor ebp,ecx - xor edi,DWORD [esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [12+esp],edi - lea edi,[3395469782+edx*1+edi] - mov edx,DWORD [16+esp] - add edi,ebp - ; 20_39 68 - mov ebp,esi - xor edx,DWORD [24+esp] - xor ebp,eax - xor edx,DWORD [48+esp] - xor ebp,ebx - xor edx,DWORD [4+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [16+esp],edx - lea edx,[3395469782+ecx*1+edx] - mov ecx,DWORD [20+esp] - add edx,ebp - ; 20_39 69 - mov ebp,edi - xor ecx,DWORD [28+esp] - xor ebp,esi - xor ecx,DWORD [52+esp] - xor ebp,eax - xor ecx,DWORD [8+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx 
- rol ebp,5 - mov DWORD [20+esp],ecx - lea ecx,[3395469782+ebx*1+ecx] - mov ebx,DWORD [24+esp] - add ecx,ebp - ; 20_39 70 - mov ebp,edx - xor ebx,DWORD [32+esp] - xor ebp,edi - xor ebx,DWORD [56+esp] - xor ebp,esi - xor ebx,DWORD [12+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [24+esp],ebx - lea ebx,[3395469782+eax*1+ebx] - mov eax,DWORD [28+esp] - add ebx,ebp - ; 20_39 71 - mov ebp,ecx - xor eax,DWORD [36+esp] - xor ebp,edx - xor eax,DWORD [60+esp] - xor ebp,edi - xor eax,DWORD [16+esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - mov DWORD [28+esp],eax - lea eax,[3395469782+esi*1+eax] - mov esi,DWORD [32+esp] - add eax,ebp - ; 20_39 72 - mov ebp,ebx - xor esi,DWORD [40+esp] - xor ebp,ecx - xor esi,DWORD [esp] - xor ebp,edx - xor esi,DWORD [20+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - mov DWORD [32+esp],esi - lea esi,[3395469782+edi*1+esi] - mov edi,DWORD [36+esp] - add esi,ebp - ; 20_39 73 - mov ebp,eax - xor edi,DWORD [44+esp] - xor ebp,ebx - xor edi,DWORD [4+esp] - xor ebp,ecx - xor edi,DWORD [24+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - mov DWORD [36+esp],edi - lea edi,[3395469782+edx*1+edi] - mov edx,DWORD [40+esp] - add edi,ebp - ; 20_39 74 - mov ebp,esi - xor edx,DWORD [48+esp] - xor ebp,eax - xor edx,DWORD [8+esp] - xor ebp,ebx - xor edx,DWORD [28+esp] - rol edx,1 - add ecx,ebp - ror esi,2 - mov ebp,edi - rol ebp,5 - mov DWORD [40+esp],edx - lea edx,[3395469782+ecx*1+edx] - mov ecx,DWORD [44+esp] - add edx,ebp - ; 20_39 75 - mov ebp,edi - xor ecx,DWORD [52+esp] - xor ebp,esi - xor ecx,DWORD [12+esp] - xor ebp,eax - xor ecx,DWORD [32+esp] - rol ecx,1 - add ebx,ebp - ror edi,2 - mov ebp,edx - rol ebp,5 - mov DWORD [44+esp],ecx - lea ecx,[3395469782+ebx*1+ecx] - mov ebx,DWORD [48+esp] - add ecx,ebp - ; 20_39 76 - mov ebp,edx - xor ebx,DWORD [56+esp] - xor ebp,edi - xor ebx,DWORD [16+esp] - xor ebp,esi - xor ebx,DWORD [36+esp] - rol ebx,1 - add eax,ebp - ror edx,2 - mov ebp,ecx - rol ebp,5 - mov DWORD [48+esp],ebx - lea ebx,[3395469782+eax*1+ebx] - mov eax,DWORD [52+esp] - add ebx,ebp - ; 20_39 77 - mov ebp,ecx - xor eax,DWORD [60+esp] - xor ebp,edx - xor eax,DWORD [20+esp] - xor ebp,edi - xor eax,DWORD [40+esp] - rol eax,1 - add esi,ebp - ror ecx,2 - mov ebp,ebx - rol ebp,5 - lea eax,[3395469782+esi*1+eax] - mov esi,DWORD [56+esp] - add eax,ebp - ; 20_39 78 - mov ebp,ebx - xor esi,DWORD [esp] - xor ebp,ecx - xor esi,DWORD [24+esp] - xor ebp,edx - xor esi,DWORD [44+esp] - rol esi,1 - add edi,ebp - ror ebx,2 - mov ebp,eax - rol ebp,5 - lea esi,[3395469782+edi*1+esi] - mov edi,DWORD [60+esp] - add esi,ebp - ; 20_39 79 - mov ebp,eax - xor edi,DWORD [4+esp] - xor ebp,ebx - xor edi,DWORD [28+esp] - xor ebp,ecx - xor edi,DWORD [48+esp] - rol edi,1 - add edx,ebp - ror eax,2 - mov ebp,esi - rol ebp,5 - lea edi,[3395469782+edx*1+edi] - add edi,ebp - mov ebp,DWORD [96+esp] - mov edx,DWORD [100+esp] - add edi,DWORD [ebp] - add esi,DWORD [4+ebp] - add eax,DWORD [8+ebp] - add ebx,DWORD [12+ebp] - add ecx,DWORD [16+ebp] - mov DWORD [ebp],edi - add edx,64 - mov DWORD [4+ebp],esi - cmp edx,DWORD [104+esp] - mov DWORD [8+ebp],eax - mov edi,ecx - mov DWORD [12+ebp],ebx - mov esi,edx - mov DWORD [16+ebp],ecx - jb NEAR L$000loop - add esp,76 - pop edi - pop esi - pop ebx - pop ebp - ret -global _sha1_block_data_order_ssse3 -align 16 -_sha1_block_data_order_ssse3: -L$_sha1_block_data_order_ssse3_begin: - push ebp - push ebx - push esi - push edi - call L$001pic_point -L$001pic_point: - pop 
ebp - lea ebp,[(L$K_XX_XX-L$001pic_point)+ebp] - movdqa xmm7,[ebp] - movdqa xmm0,[16+ebp] - movdqa xmm1,[32+ebp] - movdqa xmm2,[48+ebp] - movdqa xmm6,[64+ebp] - mov edi,DWORD [20+esp] - mov ebp,DWORD [24+esp] - mov edx,DWORD [28+esp] - mov esi,esp - sub esp,208 - and esp,-64 - movdqa [112+esp],xmm0 - movdqa [128+esp],xmm1 - movdqa [144+esp],xmm2 - shl edx,6 - movdqa [160+esp],xmm7 - add edx,ebp - movdqa [176+esp],xmm6 - add ebp,64 - mov DWORD [192+esp],edi - mov DWORD [196+esp],ebp - mov DWORD [200+esp],edx - mov DWORD [204+esp],esi - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - mov edi,DWORD [16+edi] - mov esi,ebx - movdqu xmm0,[ebp-64] - movdqu xmm1,[ebp-48] - movdqu xmm2,[ebp-32] - movdqu xmm3,[ebp-16] -db 102,15,56,0,198 -db 102,15,56,0,206 -db 102,15,56,0,214 - movdqa [96+esp],xmm7 -db 102,15,56,0,222 - paddd xmm0,xmm7 - paddd xmm1,xmm7 - paddd xmm2,xmm7 - movdqa [esp],xmm0 - psubd xmm0,xmm7 - movdqa [16+esp],xmm1 - psubd xmm1,xmm7 - movdqa [32+esp],xmm2 - mov ebp,ecx - psubd xmm2,xmm7 - xor ebp,edx - pshufd xmm4,xmm0,238 - and esi,ebp - jmp NEAR L$002loop -align 16 -L$002loop: - ror ebx,2 - xor esi,edx - mov ebp,eax - punpcklqdq xmm4,xmm1 - movdqa xmm6,xmm3 - add edi,DWORD [esp] - xor ebx,ecx - paddd xmm7,xmm3 - movdqa [64+esp],xmm0 - rol eax,5 - add edi,esi - psrldq xmm6,4 - and ebp,ebx - xor ebx,ecx - pxor xmm4,xmm0 - add edi,eax - ror eax,7 - pxor xmm6,xmm2 - xor ebp,ecx - mov esi,edi - add edx,DWORD [4+esp] - pxor xmm4,xmm6 - xor eax,ebx - rol edi,5 - movdqa [48+esp],xmm7 - add edx,ebp - and esi,eax - movdqa xmm0,xmm4 - xor eax,ebx - add edx,edi - ror edi,7 - movdqa xmm6,xmm4 - xor esi,ebx - pslldq xmm0,12 - paddd xmm4,xmm4 - mov ebp,edx - add ecx,DWORD [8+esp] - psrld xmm6,31 - xor edi,eax - rol edx,5 - movdqa xmm7,xmm0 - add ecx,esi - and ebp,edi - xor edi,eax - psrld xmm0,30 - add ecx,edx - ror edx,7 - por xmm4,xmm6 - xor ebp,eax - mov esi,ecx - add ebx,DWORD [12+esp] - pslld xmm7,2 - xor edx,edi - rol ecx,5 - pxor xmm4,xmm0 - movdqa xmm0,[96+esp] - add ebx,ebp - and esi,edx - pxor xmm4,xmm7 - pshufd xmm5,xmm1,238 - xor edx,edi - add ebx,ecx - ror ecx,7 - xor esi,edi - mov ebp,ebx - punpcklqdq xmm5,xmm2 - movdqa xmm7,xmm4 - add eax,DWORD [16+esp] - xor ecx,edx - paddd xmm0,xmm4 - movdqa [80+esp],xmm1 - rol ebx,5 - add eax,esi - psrldq xmm7,4 - and ebp,ecx - xor ecx,edx - pxor xmm5,xmm1 - add eax,ebx - ror ebx,7 - pxor xmm7,xmm3 - xor ebp,edx - mov esi,eax - add edi,DWORD [20+esp] - pxor xmm5,xmm7 - xor ebx,ecx - rol eax,5 - movdqa [esp],xmm0 - add edi,ebp - and esi,ebx - movdqa xmm1,xmm5 - xor ebx,ecx - add edi,eax - ror eax,7 - movdqa xmm7,xmm5 - xor esi,ecx - pslldq xmm1,12 - paddd xmm5,xmm5 - mov ebp,edi - add edx,DWORD [24+esp] - psrld xmm7,31 - xor eax,ebx - rol edi,5 - movdqa xmm0,xmm1 - add edx,esi - and ebp,eax - xor eax,ebx - psrld xmm1,30 - add edx,edi - ror edi,7 - por xmm5,xmm7 - xor ebp,ebx - mov esi,edx - add ecx,DWORD [28+esp] - pslld xmm0,2 - xor edi,eax - rol edx,5 - pxor xmm5,xmm1 - movdqa xmm1,[112+esp] - add ecx,ebp - and esi,edi - pxor xmm5,xmm0 - pshufd xmm6,xmm2,238 - xor edi,eax - add ecx,edx - ror edx,7 - xor esi,eax - mov ebp,ecx - punpcklqdq xmm6,xmm3 - movdqa xmm0,xmm5 - add ebx,DWORD [32+esp] - xor edx,edi - paddd xmm1,xmm5 - movdqa [96+esp],xmm2 - rol ecx,5 - add ebx,esi - psrldq xmm0,4 - and ebp,edx - xor edx,edi - pxor xmm6,xmm2 - add ebx,ecx - ror ecx,7 - pxor xmm0,xmm4 - xor ebp,edi - mov esi,ebx - add eax,DWORD [36+esp] - pxor xmm6,xmm0 - xor ecx,edx - rol ebx,5 - movdqa [16+esp],xmm1 - add 
eax,ebp - and esi,ecx - movdqa xmm2,xmm6 - xor ecx,edx - add eax,ebx - ror ebx,7 - movdqa xmm0,xmm6 - xor esi,edx - pslldq xmm2,12 - paddd xmm6,xmm6 - mov ebp,eax - add edi,DWORD [40+esp] - psrld xmm0,31 - xor ebx,ecx - rol eax,5 - movdqa xmm1,xmm2 - add edi,esi - and ebp,ebx - xor ebx,ecx - psrld xmm2,30 - add edi,eax - ror eax,7 - por xmm6,xmm0 - xor ebp,ecx - movdqa xmm0,[64+esp] - mov esi,edi - add edx,DWORD [44+esp] - pslld xmm1,2 - xor eax,ebx - rol edi,5 - pxor xmm6,xmm2 - movdqa xmm2,[112+esp] - add edx,ebp - and esi,eax - pxor xmm6,xmm1 - pshufd xmm7,xmm3,238 - xor eax,ebx - add edx,edi - ror edi,7 - xor esi,ebx - mov ebp,edx - punpcklqdq xmm7,xmm4 - movdqa xmm1,xmm6 - add ecx,DWORD [48+esp] - xor edi,eax - paddd xmm2,xmm6 - movdqa [64+esp],xmm3 - rol edx,5 - add ecx,esi - psrldq xmm1,4 - and ebp,edi - xor edi,eax - pxor xmm7,xmm3 - add ecx,edx - ror edx,7 - pxor xmm1,xmm5 - xor ebp,eax - mov esi,ecx - add ebx,DWORD [52+esp] - pxor xmm7,xmm1 - xor edx,edi - rol ecx,5 - movdqa [32+esp],xmm2 - add ebx,ebp - and esi,edx - movdqa xmm3,xmm7 - xor edx,edi - add ebx,ecx - ror ecx,7 - movdqa xmm1,xmm7 - xor esi,edi - pslldq xmm3,12 - paddd xmm7,xmm7 - mov ebp,ebx - add eax,DWORD [56+esp] - psrld xmm1,31 - xor ecx,edx - rol ebx,5 - movdqa xmm2,xmm3 - add eax,esi - and ebp,ecx - xor ecx,edx - psrld xmm3,30 - add eax,ebx - ror ebx,7 - por xmm7,xmm1 - xor ebp,edx - movdqa xmm1,[80+esp] - mov esi,eax - add edi,DWORD [60+esp] - pslld xmm2,2 - xor ebx,ecx - rol eax,5 - pxor xmm7,xmm3 - movdqa xmm3,[112+esp] - add edi,ebp - and esi,ebx - pxor xmm7,xmm2 - pshufd xmm2,xmm6,238 - xor ebx,ecx - add edi,eax - ror eax,7 - pxor xmm0,xmm4 - punpcklqdq xmm2,xmm7 - xor esi,ecx - mov ebp,edi - add edx,DWORD [esp] - pxor xmm0,xmm1 - movdqa [80+esp],xmm4 - xor eax,ebx - rol edi,5 - movdqa xmm4,xmm3 - add edx,esi - paddd xmm3,xmm7 - and ebp,eax - pxor xmm0,xmm2 - xor eax,ebx - add edx,edi - ror edi,7 - xor ebp,ebx - movdqa xmm2,xmm0 - movdqa [48+esp],xmm3 - mov esi,edx - add ecx,DWORD [4+esp] - xor edi,eax - rol edx,5 - pslld xmm0,2 - add ecx,ebp - and esi,edi - psrld xmm2,30 - xor edi,eax - add ecx,edx - ror edx,7 - xor esi,eax - mov ebp,ecx - add ebx,DWORD [8+esp] - xor edx,edi - rol ecx,5 - por xmm0,xmm2 - add ebx,esi - and ebp,edx - movdqa xmm2,[96+esp] - xor edx,edi - add ebx,ecx - add eax,DWORD [12+esp] - xor ebp,edi - mov esi,ebx - pshufd xmm3,xmm7,238 - rol ebx,5 - add eax,ebp - xor esi,edx - ror ecx,7 - add eax,ebx - add edi,DWORD [16+esp] - pxor xmm1,xmm5 - punpcklqdq xmm3,xmm0 - xor esi,ecx - mov ebp,eax - rol eax,5 - pxor xmm1,xmm2 - movdqa [96+esp],xmm5 - add edi,esi - xor ebp,ecx - movdqa xmm5,xmm4 - ror ebx,7 - paddd xmm4,xmm0 - add edi,eax - pxor xmm1,xmm3 - add edx,DWORD [20+esp] - xor ebp,ebx - mov esi,edi - rol edi,5 - movdqa xmm3,xmm1 - movdqa [esp],xmm4 - add edx,ebp - xor esi,ebx - ror eax,7 - add edx,edi - pslld xmm1,2 - add ecx,DWORD [24+esp] - xor esi,eax - psrld xmm3,30 - mov ebp,edx - rol edx,5 - add ecx,esi - xor ebp,eax - ror edi,7 - add ecx,edx - por xmm1,xmm3 - add ebx,DWORD [28+esp] - xor ebp,edi - movdqa xmm3,[64+esp] - mov esi,ecx - rol ecx,5 - add ebx,ebp - xor esi,edi - ror edx,7 - pshufd xmm4,xmm0,238 - add ebx,ecx - add eax,DWORD [32+esp] - pxor xmm2,xmm6 - punpcklqdq xmm4,xmm1 - xor esi,edx - mov ebp,ebx - rol ebx,5 - pxor xmm2,xmm3 - movdqa [64+esp],xmm6 - add eax,esi - xor ebp,edx - movdqa xmm6,[128+esp] - ror ecx,7 - paddd xmm5,xmm1 - add eax,ebx - pxor xmm2,xmm4 - add edi,DWORD [36+esp] - xor ebp,ecx - mov esi,eax - rol eax,5 - movdqa xmm4,xmm2 - movdqa [16+esp],xmm5 
- add edi,ebp - xor esi,ecx - ror ebx,7 - add edi,eax - pslld xmm2,2 - add edx,DWORD [40+esp] - xor esi,ebx - psrld xmm4,30 - mov ebp,edi - rol edi,5 - add edx,esi - xor ebp,ebx - ror eax,7 - add edx,edi - por xmm2,xmm4 - add ecx,DWORD [44+esp] - xor ebp,eax - movdqa xmm4,[80+esp] - mov esi,edx - rol edx,5 - add ecx,ebp - xor esi,eax - ror edi,7 - pshufd xmm5,xmm1,238 - add ecx,edx - add ebx,DWORD [48+esp] - pxor xmm3,xmm7 - punpcklqdq xmm5,xmm2 - xor esi,edi - mov ebp,ecx - rol ecx,5 - pxor xmm3,xmm4 - movdqa [80+esp],xmm7 - add ebx,esi - xor ebp,edi - movdqa xmm7,xmm6 - ror edx,7 - paddd xmm6,xmm2 - add ebx,ecx - pxor xmm3,xmm5 - add eax,DWORD [52+esp] - xor ebp,edx - mov esi,ebx - rol ebx,5 - movdqa xmm5,xmm3 - movdqa [32+esp],xmm6 - add eax,ebp - xor esi,edx - ror ecx,7 - add eax,ebx - pslld xmm3,2 - add edi,DWORD [56+esp] - xor esi,ecx - psrld xmm5,30 - mov ebp,eax - rol eax,5 - add edi,esi - xor ebp,ecx - ror ebx,7 - add edi,eax - por xmm3,xmm5 - add edx,DWORD [60+esp] - xor ebp,ebx - movdqa xmm5,[96+esp] - mov esi,edi - rol edi,5 - add edx,ebp - xor esi,ebx - ror eax,7 - pshufd xmm6,xmm2,238 - add edx,edi - add ecx,DWORD [esp] - pxor xmm4,xmm0 - punpcklqdq xmm6,xmm3 - xor esi,eax - mov ebp,edx - rol edx,5 - pxor xmm4,xmm5 - movdqa [96+esp],xmm0 - add ecx,esi - xor ebp,eax - movdqa xmm0,xmm7 - ror edi,7 - paddd xmm7,xmm3 - add ecx,edx - pxor xmm4,xmm6 - add ebx,DWORD [4+esp] - xor ebp,edi - mov esi,ecx - rol ecx,5 - movdqa xmm6,xmm4 - movdqa [48+esp],xmm7 - add ebx,ebp - xor esi,edi - ror edx,7 - add ebx,ecx - pslld xmm4,2 - add eax,DWORD [8+esp] - xor esi,edx - psrld xmm6,30 - mov ebp,ebx - rol ebx,5 - add eax,esi - xor ebp,edx - ror ecx,7 - add eax,ebx - por xmm4,xmm6 - add edi,DWORD [12+esp] - xor ebp,ecx - movdqa xmm6,[64+esp] - mov esi,eax - rol eax,5 - add edi,ebp - xor esi,ecx - ror ebx,7 - pshufd xmm7,xmm3,238 - add edi,eax - add edx,DWORD [16+esp] - pxor xmm5,xmm1 - punpcklqdq xmm7,xmm4 - xor esi,ebx - mov ebp,edi - rol edi,5 - pxor xmm5,xmm6 - movdqa [64+esp],xmm1 - add edx,esi - xor ebp,ebx - movdqa xmm1,xmm0 - ror eax,7 - paddd xmm0,xmm4 - add edx,edi - pxor xmm5,xmm7 - add ecx,DWORD [20+esp] - xor ebp,eax - mov esi,edx - rol edx,5 - movdqa xmm7,xmm5 - movdqa [esp],xmm0 - add ecx,ebp - xor esi,eax - ror edi,7 - add ecx,edx - pslld xmm5,2 - add ebx,DWORD [24+esp] - xor esi,edi - psrld xmm7,30 - mov ebp,ecx - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 - add ebx,ecx - por xmm5,xmm7 - add eax,DWORD [28+esp] - movdqa xmm7,[80+esp] - ror ecx,7 - mov esi,ebx - xor ebp,edx - rol ebx,5 - pshufd xmm0,xmm4,238 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [32+esp] - pxor xmm6,xmm2 - punpcklqdq xmm0,xmm5 - and esi,ecx - xor ecx,edx - ror ebx,7 - pxor xmm6,xmm7 - movdqa [80+esp],xmm2 - mov ebp,eax - xor esi,ecx - rol eax,5 - movdqa xmm2,xmm1 - add edi,esi - paddd xmm1,xmm5 - xor ebp,ebx - pxor xmm6,xmm0 - xor ebx,ecx - add edi,eax - add edx,DWORD [36+esp] - and ebp,ebx - movdqa xmm0,xmm6 - movdqa [16+esp],xmm1 - xor ebx,ecx - ror eax,7 - mov esi,edi - xor ebp,ebx - rol edi,5 - pslld xmm6,2 - add edx,ebp - xor esi,eax - psrld xmm0,30 - xor eax,ebx - add edx,edi - add ecx,DWORD [40+esp] - and esi,eax - xor eax,ebx - ror edi,7 - por xmm6,xmm0 - mov ebp,edx - xor esi,eax - movdqa xmm0,[96+esp] - rol edx,5 - add ecx,esi - xor ebp,edi - xor edi,eax - add ecx,edx - pshufd xmm1,xmm5,238 - add ebx,DWORD [44+esp] - and ebp,edi - xor edi,eax - ror edx,7 - mov esi,ecx - xor ebp,edi - rol ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - add eax,DWORD 
[48+esp] - pxor xmm7,xmm3 - punpcklqdq xmm1,xmm6 - and esi,edx - xor edx,edi - ror ecx,7 - pxor xmm7,xmm0 - movdqa [96+esp],xmm3 - mov ebp,ebx - xor esi,edx - rol ebx,5 - movdqa xmm3,[144+esp] - add eax,esi - paddd xmm2,xmm6 - xor ebp,ecx - pxor xmm7,xmm1 - xor ecx,edx - add eax,ebx - add edi,DWORD [52+esp] - and ebp,ecx - movdqa xmm1,xmm7 - movdqa [32+esp],xmm2 - xor ecx,edx - ror ebx,7 - mov esi,eax - xor ebp,ecx - rol eax,5 - pslld xmm7,2 - add edi,ebp - xor esi,ebx - psrld xmm1,30 - xor ebx,ecx - add edi,eax - add edx,DWORD [56+esp] - and esi,ebx - xor ebx,ecx - ror eax,7 - por xmm7,xmm1 - mov ebp,edi - xor esi,ebx - movdqa xmm1,[64+esp] - rol edi,5 - add edx,esi - xor ebp,eax - xor eax,ebx - add edx,edi - pshufd xmm2,xmm6,238 - add ecx,DWORD [60+esp] - and ebp,eax - xor eax,ebx - ror edi,7 - mov esi,edx - xor ebp,eax - rol edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [esp] - pxor xmm0,xmm4 - punpcklqdq xmm2,xmm7 - and esi,edi - xor edi,eax - ror edx,7 - pxor xmm0,xmm1 - movdqa [64+esp],xmm4 - mov ebp,ecx - xor esi,edi - rol ecx,5 - movdqa xmm4,xmm3 - add ebx,esi - paddd xmm3,xmm7 - xor ebp,edx - pxor xmm0,xmm2 - xor edx,edi - add ebx,ecx - add eax,DWORD [4+esp] - and ebp,edx - movdqa xmm2,xmm0 - movdqa [48+esp],xmm3 - xor edx,edi - ror ecx,7 - mov esi,ebx - xor ebp,edx - rol ebx,5 - pslld xmm0,2 - add eax,ebp - xor esi,ecx - psrld xmm2,30 - xor ecx,edx - add eax,ebx - add edi,DWORD [8+esp] - and esi,ecx - xor ecx,edx - ror ebx,7 - por xmm0,xmm2 - mov ebp,eax - xor esi,ecx - movdqa xmm2,[80+esp] - rol eax,5 - add edi,esi - xor ebp,ebx - xor ebx,ecx - add edi,eax - pshufd xmm3,xmm7,238 - add edx,DWORD [12+esp] - and ebp,ebx - xor ebx,ecx - ror eax,7 - mov esi,edi - xor ebp,ebx - rol edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [16+esp] - pxor xmm1,xmm5 - punpcklqdq xmm3,xmm0 - and esi,eax - xor eax,ebx - ror edi,7 - pxor xmm1,xmm2 - movdqa [80+esp],xmm5 - mov ebp,edx - xor esi,eax - rol edx,5 - movdqa xmm5,xmm4 - add ecx,esi - paddd xmm4,xmm0 - xor ebp,edi - pxor xmm1,xmm3 - xor edi,eax - add ecx,edx - add ebx,DWORD [20+esp] - and ebp,edi - movdqa xmm3,xmm1 - movdqa [esp],xmm4 - xor edi,eax - ror edx,7 - mov esi,ecx - xor ebp,edi - rol ecx,5 - pslld xmm1,2 - add ebx,ebp - xor esi,edx - psrld xmm3,30 - xor edx,edi - add ebx,ecx - add eax,DWORD [24+esp] - and esi,edx - xor edx,edi - ror ecx,7 - por xmm1,xmm3 - mov ebp,ebx - xor esi,edx - movdqa xmm3,[96+esp] - rol ebx,5 - add eax,esi - xor ebp,ecx - xor ecx,edx - add eax,ebx - pshufd xmm4,xmm0,238 - add edi,DWORD [28+esp] - and ebp,ecx - xor ecx,edx - ror ebx,7 - mov esi,eax - xor ebp,ecx - rol eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [32+esp] - pxor xmm2,xmm6 - punpcklqdq xmm4,xmm1 - and esi,ebx - xor ebx,ecx - ror eax,7 - pxor xmm2,xmm3 - movdqa [96+esp],xmm6 - mov ebp,edi - xor esi,ebx - rol edi,5 - movdqa xmm6,xmm5 - add edx,esi - paddd xmm5,xmm1 - xor ebp,eax - pxor xmm2,xmm4 - xor eax,ebx - add edx,edi - add ecx,DWORD [36+esp] - and ebp,eax - movdqa xmm4,xmm2 - movdqa [16+esp],xmm5 - xor eax,ebx - ror edi,7 - mov esi,edx - xor ebp,eax - rol edx,5 - pslld xmm2,2 - add ecx,ebp - xor esi,edi - psrld xmm4,30 - xor edi,eax - add ecx,edx - add ebx,DWORD [40+esp] - and esi,edi - xor edi,eax - ror edx,7 - por xmm2,xmm4 - mov ebp,ecx - xor esi,edi - movdqa xmm4,[64+esp] - rol ecx,5 - add ebx,esi - xor ebp,edx - xor edx,edi - add ebx,ecx - pshufd xmm5,xmm1,238 - add eax,DWORD [44+esp] - and ebp,edx - xor edx,edi - ror ecx,7 - mov esi,ebx - 
xor ebp,edx - rol ebx,5 - add eax,ebp - xor esi,edx - add eax,ebx - add edi,DWORD [48+esp] - pxor xmm3,xmm7 - punpcklqdq xmm5,xmm2 - xor esi,ecx - mov ebp,eax - rol eax,5 - pxor xmm3,xmm4 - movdqa [64+esp],xmm7 - add edi,esi - xor ebp,ecx - movdqa xmm7,xmm6 - ror ebx,7 - paddd xmm6,xmm2 - add edi,eax - pxor xmm3,xmm5 - add edx,DWORD [52+esp] - xor ebp,ebx - mov esi,edi - rol edi,5 - movdqa xmm5,xmm3 - movdqa [32+esp],xmm6 - add edx,ebp - xor esi,ebx - ror eax,7 - add edx,edi - pslld xmm3,2 - add ecx,DWORD [56+esp] - xor esi,eax - psrld xmm5,30 - mov ebp,edx - rol edx,5 - add ecx,esi - xor ebp,eax - ror edi,7 - add ecx,edx - por xmm3,xmm5 - add ebx,DWORD [60+esp] - xor ebp,edi - mov esi,ecx - rol ecx,5 - add ebx,ebp - xor esi,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [esp] - xor esi,edx - mov ebp,ebx - rol ebx,5 - add eax,esi - xor ebp,edx - ror ecx,7 - paddd xmm7,xmm3 - add eax,ebx - add edi,DWORD [4+esp] - xor ebp,ecx - mov esi,eax - movdqa [48+esp],xmm7 - rol eax,5 - add edi,ebp - xor esi,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [8+esp] - xor esi,ebx - mov ebp,edi - rol edi,5 - add edx,esi - xor ebp,ebx - ror eax,7 - add edx,edi - add ecx,DWORD [12+esp] - xor ebp,eax - mov esi,edx - rol edx,5 - add ecx,ebp - xor esi,eax - ror edi,7 - add ecx,edx - mov ebp,DWORD [196+esp] - cmp ebp,DWORD [200+esp] - je NEAR L$003done - movdqa xmm7,[160+esp] - movdqa xmm6,[176+esp] - movdqu xmm0,[ebp] - movdqu xmm1,[16+ebp] - movdqu xmm2,[32+ebp] - movdqu xmm3,[48+ebp] - add ebp,64 -db 102,15,56,0,198 - mov DWORD [196+esp],ebp - movdqa [96+esp],xmm7 - add ebx,DWORD [16+esp] - xor esi,edi - mov ebp,ecx - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 -db 102,15,56,0,206 - add ebx,ecx - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - paddd xmm0,xmm7 - rol ebx,5 - add eax,ebp - xor esi,edx - ror ecx,7 - movdqa [esp],xmm0 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - psubd xmm0,xmm7 - rol eax,5 - add edi,esi - xor ebp,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - rol edi,5 - add edx,ebp - xor esi,ebx - ror eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - mov ebp,edx - rol edx,5 - add ecx,esi - xor ebp,eax - ror edi,7 -db 102,15,56,0,214 - add ecx,edx - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - paddd xmm1,xmm7 - rol ecx,5 - add ebx,ebp - xor esi,edi - ror edx,7 - movdqa [16+esp],xmm1 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - psubd xmm1,xmm7 - rol ebx,5 - add eax,esi - xor ebp,edx - ror ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - rol eax,5 - add edi,ebp - xor esi,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - mov ebp,edi - rol edi,5 - add edx,esi - xor ebp,ebx - ror eax,7 -db 102,15,56,0,222 - add edx,edi - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - paddd xmm2,xmm7 - rol edx,5 - add ecx,ebp - xor esi,eax - ror edi,7 - movdqa [32+esp],xmm2 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - psubd xmm2,xmm7 - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - rol ebx,5 - add eax,ebp - ror ecx,7 - add eax,ebx - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov DWORD [8+ebp],ecx - mov ebx,ecx - mov DWORD [12+ebp],edx - xor ebx,edx - mov DWORD [16+ebp],edi - mov ebp,esi - pshufd xmm4,xmm0,238 - and 
esi,ebx - mov ebx,ebp - jmp NEAR L$002loop -align 16 -L$003done: - add ebx,DWORD [16+esp] - xor esi,edi - mov ebp,ecx - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - rol ebx,5 - add eax,ebp - xor esi,edx - ror ecx,7 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - rol eax,5 - add edi,esi - xor ebp,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - rol edi,5 - add edx,ebp - xor esi,ebx - ror eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - mov ebp,edx - rol edx,5 - add ecx,esi - xor ebp,eax - ror edi,7 - add ecx,edx - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - rol ecx,5 - add ebx,ebp - xor esi,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - rol ebx,5 - add eax,esi - xor ebp,edx - ror ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - rol eax,5 - add edi,ebp - xor esi,ecx - ror ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - mov ebp,edi - rol edi,5 - add edx,esi - xor ebp,ebx - ror eax,7 - add edx,edi - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - rol edx,5 - add ecx,ebp - xor esi,eax - ror edi,7 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - rol ecx,5 - add ebx,esi - xor ebp,edi - ror edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - rol ebx,5 - add eax,ebp - ror ecx,7 - add eax,ebx - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - mov esp,DWORD [204+esp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov DWORD [8+ebp],ecx - mov DWORD [12+ebp],edx - mov DWORD [16+ebp],edi - pop edi - pop esi - pop ebx - pop ebp - ret -global _sha1_block_data_order_avx -align 16 -_sha1_block_data_order_avx: -L$_sha1_block_data_order_avx_begin: - push ebp - push ebx - push esi - push edi - call L$004pic_point -L$004pic_point: - pop ebp - lea ebp,[(L$K_XX_XX-L$004pic_point)+ebp] - vzeroall - vmovdqa xmm7,[ebp] - vmovdqa xmm0,[16+ebp] - vmovdqa xmm1,[32+ebp] - vmovdqa xmm2,[48+ebp] - vmovdqa xmm6,[64+ebp] - mov edi,DWORD [20+esp] - mov ebp,DWORD [24+esp] - mov edx,DWORD [28+esp] - mov esi,esp - sub esp,208 - and esp,-64 - vmovdqa [112+esp],xmm0 - vmovdqa [128+esp],xmm1 - vmovdqa [144+esp],xmm2 - shl edx,6 - vmovdqa [160+esp],xmm7 - add edx,ebp - vmovdqa [176+esp],xmm6 - add ebp,64 - mov DWORD [192+esp],edi - mov DWORD [196+esp],ebp - mov DWORD [200+esp],edx - mov DWORD [204+esp],esi - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - mov edi,DWORD [16+edi] - mov esi,ebx - vmovdqu xmm0,[ebp-64] - vmovdqu xmm1,[ebp-48] - vmovdqu xmm2,[ebp-32] - vmovdqu xmm3,[ebp-16] - vpshufb xmm0,xmm0,xmm6 - vpshufb xmm1,xmm1,xmm6 - vpshufb xmm2,xmm2,xmm6 - vmovdqa [96+esp],xmm7 - vpshufb xmm3,xmm3,xmm6 - vpaddd xmm4,xmm0,xmm7 - vpaddd xmm5,xmm1,xmm7 - vpaddd xmm6,xmm2,xmm7 - vmovdqa [esp],xmm4 - mov ebp,ecx - vmovdqa [16+esp],xmm5 - xor ebp,edx - vmovdqa [32+esp],xmm6 - and esi,ebp - jmp NEAR L$005loop -align 16 -L$005loop: - shrd ebx,ebx,2 - xor esi,edx - vpalignr xmm4,xmm1,xmm0,8 - mov ebp,eax - add edi,DWORD [esp] - vpaddd xmm7,xmm7,xmm3 - vmovdqa [64+esp],xmm0 - xor ebx,ecx - shld eax,eax,5 - vpsrldq xmm6,xmm3,4 - add edi,esi - and ebp,ebx - vpxor xmm4,xmm4,xmm0 - xor ebx,ecx - add edi,eax - vpxor xmm6,xmm6,xmm2 - shrd eax,eax,7 - xor ebp,ecx - vmovdqa [48+esp],xmm7 - mov esi,edi - add edx,DWORD [4+esp] - 
vpxor xmm4,xmm4,xmm6 - xor eax,ebx - shld edi,edi,5 - add edx,ebp - and esi,eax - vpsrld xmm6,xmm4,31 - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor esi,ebx - vpslldq xmm0,xmm4,12 - vpaddd xmm4,xmm4,xmm4 - mov ebp,edx - add ecx,DWORD [8+esp] - xor edi,eax - shld edx,edx,5 - vpsrld xmm7,xmm0,30 - vpor xmm4,xmm4,xmm6 - add ecx,esi - and ebp,edi - xor edi,eax - add ecx,edx - vpslld xmm0,xmm0,2 - shrd edx,edx,7 - xor ebp,eax - vpxor xmm4,xmm4,xmm7 - mov esi,ecx - add ebx,DWORD [12+esp] - xor edx,edi - shld ecx,ecx,5 - vpxor xmm4,xmm4,xmm0 - add ebx,ebp - and esi,edx - vmovdqa xmm0,[96+esp] - xor edx,edi - add ebx,ecx - shrd ecx,ecx,7 - xor esi,edi - vpalignr xmm5,xmm2,xmm1,8 - mov ebp,ebx - add eax,DWORD [16+esp] - vpaddd xmm0,xmm0,xmm4 - vmovdqa [80+esp],xmm1 - xor ecx,edx - shld ebx,ebx,5 - vpsrldq xmm7,xmm4,4 - add eax,esi - and ebp,ecx - vpxor xmm5,xmm5,xmm1 - xor ecx,edx - add eax,ebx - vpxor xmm7,xmm7,xmm3 - shrd ebx,ebx,7 - xor ebp,edx - vmovdqa [esp],xmm0 - mov esi,eax - add edi,DWORD [20+esp] - vpxor xmm5,xmm5,xmm7 - xor ebx,ecx - shld eax,eax,5 - add edi,ebp - and esi,ebx - vpsrld xmm7,xmm5,31 - xor ebx,ecx - add edi,eax - shrd eax,eax,7 - xor esi,ecx - vpslldq xmm1,xmm5,12 - vpaddd xmm5,xmm5,xmm5 - mov ebp,edi - add edx,DWORD [24+esp] - xor eax,ebx - shld edi,edi,5 - vpsrld xmm0,xmm1,30 - vpor xmm5,xmm5,xmm7 - add edx,esi - and ebp,eax - xor eax,ebx - add edx,edi - vpslld xmm1,xmm1,2 - shrd edi,edi,7 - xor ebp,ebx - vpxor xmm5,xmm5,xmm0 - mov esi,edx - add ecx,DWORD [28+esp] - xor edi,eax - shld edx,edx,5 - vpxor xmm5,xmm5,xmm1 - add ecx,ebp - and esi,edi - vmovdqa xmm1,[112+esp] - xor edi,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - vpalignr xmm6,xmm3,xmm2,8 - mov ebp,ecx - add ebx,DWORD [32+esp] - vpaddd xmm1,xmm1,xmm5 - vmovdqa [96+esp],xmm2 - xor edx,edi - shld ecx,ecx,5 - vpsrldq xmm0,xmm5,4 - add ebx,esi - and ebp,edx - vpxor xmm6,xmm6,xmm2 - xor edx,edi - add ebx,ecx - vpxor xmm0,xmm0,xmm4 - shrd ecx,ecx,7 - xor ebp,edi - vmovdqa [16+esp],xmm1 - mov esi,ebx - add eax,DWORD [36+esp] - vpxor xmm6,xmm6,xmm0 - xor ecx,edx - shld ebx,ebx,5 - add eax,ebp - and esi,ecx - vpsrld xmm0,xmm6,31 - xor ecx,edx - add eax,ebx - shrd ebx,ebx,7 - xor esi,edx - vpslldq xmm2,xmm6,12 - vpaddd xmm6,xmm6,xmm6 - mov ebp,eax - add edi,DWORD [40+esp] - xor ebx,ecx - shld eax,eax,5 - vpsrld xmm1,xmm2,30 - vpor xmm6,xmm6,xmm0 - add edi,esi - and ebp,ebx - xor ebx,ecx - add edi,eax - vpslld xmm2,xmm2,2 - vmovdqa xmm0,[64+esp] - shrd eax,eax,7 - xor ebp,ecx - vpxor xmm6,xmm6,xmm1 - mov esi,edi - add edx,DWORD [44+esp] - xor eax,ebx - shld edi,edi,5 - vpxor xmm6,xmm6,xmm2 - add edx,ebp - and esi,eax - vmovdqa xmm2,[112+esp] - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor esi,ebx - vpalignr xmm7,xmm4,xmm3,8 - mov ebp,edx - add ecx,DWORD [48+esp] - vpaddd xmm2,xmm2,xmm6 - vmovdqa [64+esp],xmm3 - xor edi,eax - shld edx,edx,5 - vpsrldq xmm1,xmm6,4 - add ecx,esi - and ebp,edi - vpxor xmm7,xmm7,xmm3 - xor edi,eax - add ecx,edx - vpxor xmm1,xmm1,xmm5 - shrd edx,edx,7 - xor ebp,eax - vmovdqa [32+esp],xmm2 - mov esi,ecx - add ebx,DWORD [52+esp] - vpxor xmm7,xmm7,xmm1 - xor edx,edi - shld ecx,ecx,5 - add ebx,ebp - and esi,edx - vpsrld xmm1,xmm7,31 - xor edx,edi - add ebx,ecx - shrd ecx,ecx,7 - xor esi,edi - vpslldq xmm3,xmm7,12 - vpaddd xmm7,xmm7,xmm7 - mov ebp,ebx - add eax,DWORD [56+esp] - xor ecx,edx - shld ebx,ebx,5 - vpsrld xmm2,xmm3,30 - vpor xmm7,xmm7,xmm1 - add eax,esi - and ebp,ecx - xor ecx,edx - add eax,ebx - vpslld xmm3,xmm3,2 - vmovdqa xmm1,[80+esp] - shrd ebx,ebx,7 - xor ebp,edx - vpxor 
xmm7,xmm7,xmm2 - mov esi,eax - add edi,DWORD [60+esp] - xor ebx,ecx - shld eax,eax,5 - vpxor xmm7,xmm7,xmm3 - add edi,ebp - and esi,ebx - vmovdqa xmm3,[112+esp] - xor ebx,ecx - add edi,eax - vpalignr xmm2,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - shrd eax,eax,7 - xor esi,ecx - mov ebp,edi - add edx,DWORD [esp] - vpxor xmm0,xmm0,xmm1 - vmovdqa [80+esp],xmm4 - xor eax,ebx - shld edi,edi,5 - vmovdqa xmm4,xmm3 - vpaddd xmm3,xmm3,xmm7 - add edx,esi - and ebp,eax - vpxor xmm0,xmm0,xmm2 - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor ebp,ebx - vpsrld xmm2,xmm0,30 - vmovdqa [48+esp],xmm3 - mov esi,edx - add ecx,DWORD [4+esp] - xor edi,eax - shld edx,edx,5 - vpslld xmm0,xmm0,2 - add ecx,ebp - and esi,edi - xor edi,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - mov ebp,ecx - add ebx,DWORD [8+esp] - vpor xmm0,xmm0,xmm2 - xor edx,edi - shld ecx,ecx,5 - vmovdqa xmm2,[96+esp] - add ebx,esi - and ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [12+esp] - xor ebp,edi - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpalignr xmm3,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add edi,DWORD [16+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - vpxor xmm1,xmm1,xmm2 - vmovdqa [96+esp],xmm5 - add edi,esi - xor ebp,ecx - vmovdqa xmm5,xmm4 - vpaddd xmm4,xmm4,xmm0 - shrd ebx,ebx,7 - add edi,eax - vpxor xmm1,xmm1,xmm3 - add edx,DWORD [20+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - vpsrld xmm3,xmm1,30 - vmovdqa [esp],xmm4 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpslld xmm1,xmm1,2 - add ecx,DWORD [24+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vpor xmm1,xmm1,xmm3 - add ebx,DWORD [28+esp] - xor ebp,edi - vmovdqa xmm3,[64+esp] - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - vpalignr xmm4,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add eax,DWORD [32+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - vpxor xmm2,xmm2,xmm3 - vmovdqa [64+esp],xmm6 - add eax,esi - xor ebp,edx - vmovdqa xmm6,[128+esp] - vpaddd xmm5,xmm5,xmm1 - shrd ecx,ecx,7 - add eax,ebx - vpxor xmm2,xmm2,xmm4 - add edi,DWORD [36+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - vpsrld xmm4,xmm2,30 - vmovdqa [16+esp],xmm5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - vpslld xmm2,xmm2,2 - add edx,DWORD [40+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - vpor xmm2,xmm2,xmm4 - add ecx,DWORD [44+esp] - xor ebp,eax - vmovdqa xmm4,[80+esp] - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - vpalignr xmm5,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add ebx,DWORD [48+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - vpxor xmm3,xmm3,xmm4 - vmovdqa [80+esp],xmm7 - add ebx,esi - xor ebp,edi - vmovdqa xmm7,xmm6 - vpaddd xmm6,xmm6,xmm2 - shrd edx,edx,7 - add ebx,ecx - vpxor xmm3,xmm3,xmm5 - add eax,DWORD [52+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - vpsrld xmm5,xmm3,30 - vmovdqa [32+esp],xmm6 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpslld xmm3,xmm3,2 - add edi,DWORD [56+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - vpor xmm3,xmm3,xmm5 - add edx,DWORD [60+esp] - xor ebp,ebx - vmovdqa xmm5,[96+esp] - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpalignr xmm6,xmm3,xmm2,8 - vpxor xmm4,xmm4,xmm0 - add ecx,DWORD [esp] - xor esi,eax - mov ebp,edx - shld 
edx,edx,5 - vpxor xmm4,xmm4,xmm5 - vmovdqa [96+esp],xmm0 - add ecx,esi - xor ebp,eax - vmovdqa xmm0,xmm7 - vpaddd xmm7,xmm7,xmm3 - shrd edi,edi,7 - add ecx,edx - vpxor xmm4,xmm4,xmm6 - add ebx,DWORD [4+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - vpsrld xmm6,xmm4,30 - vmovdqa [48+esp],xmm7 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - vpslld xmm4,xmm4,2 - add eax,DWORD [8+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - vpor xmm4,xmm4,xmm6 - add edi,DWORD [12+esp] - xor ebp,ecx - vmovdqa xmm6,[64+esp] - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - vpalignr xmm7,xmm4,xmm3,8 - vpxor xmm5,xmm5,xmm1 - add edx,DWORD [16+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - vpxor xmm5,xmm5,xmm6 - vmovdqa [64+esp],xmm1 - add edx,esi - xor ebp,ebx - vmovdqa xmm1,xmm0 - vpaddd xmm0,xmm0,xmm4 - shrd eax,eax,7 - add edx,edi - vpxor xmm5,xmm5,xmm7 - add ecx,DWORD [20+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - vpsrld xmm7,xmm5,30 - vmovdqa [esp],xmm0 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - vpslld xmm5,xmm5,2 - add ebx,DWORD [24+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - vpor xmm5,xmm5,xmm7 - add eax,DWORD [28+esp] - vmovdqa xmm7,[80+esp] - shrd ecx,ecx,7 - mov esi,ebx - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - vpalignr xmm0,xmm5,xmm4,8 - vpxor xmm6,xmm6,xmm2 - add edi,DWORD [32+esp] - and esi,ecx - xor ecx,edx - shrd ebx,ebx,7 - vpxor xmm6,xmm6,xmm7 - vmovdqa [80+esp],xmm2 - mov ebp,eax - xor esi,ecx - vmovdqa xmm2,xmm1 - vpaddd xmm1,xmm1,xmm5 - shld eax,eax,5 - add edi,esi - vpxor xmm6,xmm6,xmm0 - xor ebp,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [36+esp] - vpsrld xmm0,xmm6,30 - vmovdqa [16+esp],xmm1 - and ebp,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,edi - vpslld xmm6,xmm6,2 - xor ebp,ebx - shld edi,edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [40+esp] - and esi,eax - vpor xmm6,xmm6,xmm0 - xor eax,ebx - shrd edi,edi,7 - vmovdqa xmm0,[96+esp] - mov ebp,edx - xor esi,eax - shld edx,edx,5 - add ecx,esi - xor ebp,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [44+esp] - and ebp,edi - xor edi,eax - shrd edx,edx,7 - mov esi,ecx - xor ebp,edi - shld ecx,ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - vpalignr xmm1,xmm6,xmm5,8 - vpxor xmm7,xmm7,xmm3 - add eax,DWORD [48+esp] - and esi,edx - xor edx,edi - shrd ecx,ecx,7 - vpxor xmm7,xmm7,xmm0 - vmovdqa [96+esp],xmm3 - mov ebp,ebx - xor esi,edx - vmovdqa xmm3,[144+esp] - vpaddd xmm2,xmm2,xmm6 - shld ebx,ebx,5 - add eax,esi - vpxor xmm7,xmm7,xmm1 - xor ebp,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [52+esp] - vpsrld xmm1,xmm7,30 - vmovdqa [32+esp],xmm2 - and ebp,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - vpslld xmm7,xmm7,2 - xor ebp,ecx - shld eax,eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [56+esp] - and esi,ebx - vpor xmm7,xmm7,xmm1 - xor ebx,ecx - shrd eax,eax,7 - vmovdqa xmm1,[64+esp] - mov ebp,edi - xor esi,ebx - shld edi,edi,5 - add edx,esi - xor ebp,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [60+esp] - and ebp,eax - xor eax,ebx - shrd edi,edi,7 - mov esi,edx - xor ebp,eax - shld edx,edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - vpalignr xmm2,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - add ebx,DWORD [esp] - and esi,edi - xor edi,eax - shrd edx,edx,7 - vpxor 
xmm0,xmm0,xmm1 - vmovdqa [64+esp],xmm4 - mov ebp,ecx - xor esi,edi - vmovdqa xmm4,xmm3 - vpaddd xmm3,xmm3,xmm7 - shld ecx,ecx,5 - add ebx,esi - vpxor xmm0,xmm0,xmm2 - xor ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [4+esp] - vpsrld xmm2,xmm0,30 - vmovdqa [48+esp],xmm3 - and ebp,edx - xor edx,edi - shrd ecx,ecx,7 - mov esi,ebx - vpslld xmm0,xmm0,2 - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [8+esp] - and esi,ecx - vpor xmm0,xmm0,xmm2 - xor ecx,edx - shrd ebx,ebx,7 - vmovdqa xmm2,[80+esp] - mov ebp,eax - xor esi,ecx - shld eax,eax,5 - add edi,esi - xor ebp,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [12+esp] - and ebp,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,edi - xor ebp,ebx - shld edi,edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - vpalignr xmm3,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add ecx,DWORD [16+esp] - and esi,eax - xor eax,ebx - shrd edi,edi,7 - vpxor xmm1,xmm1,xmm2 - vmovdqa [80+esp],xmm5 - mov ebp,edx - xor esi,eax - vmovdqa xmm5,xmm4 - vpaddd xmm4,xmm4,xmm0 - shld edx,edx,5 - add ecx,esi - vpxor xmm1,xmm1,xmm3 - xor ebp,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [20+esp] - vpsrld xmm3,xmm1,30 - vmovdqa [esp],xmm4 - and ebp,edi - xor edi,eax - shrd edx,edx,7 - mov esi,ecx - vpslld xmm1,xmm1,2 - xor ebp,edi - shld ecx,ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [24+esp] - and esi,edx - vpor xmm1,xmm1,xmm3 - xor edx,edi - shrd ecx,ecx,7 - vmovdqa xmm3,[96+esp] - mov ebp,ebx - xor esi,edx - shld ebx,ebx,5 - add eax,esi - xor ebp,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [28+esp] - and ebp,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - xor ebp,ecx - shld eax,eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - vpalignr xmm4,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add edx,DWORD [32+esp] - and esi,ebx - xor ebx,ecx - shrd eax,eax,7 - vpxor xmm2,xmm2,xmm3 - vmovdqa [96+esp],xmm6 - mov ebp,edi - xor esi,ebx - vmovdqa xmm6,xmm5 - vpaddd xmm5,xmm5,xmm1 - shld edi,edi,5 - add edx,esi - vpxor xmm2,xmm2,xmm4 - xor ebp,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [36+esp] - vpsrld xmm4,xmm2,30 - vmovdqa [16+esp],xmm5 - and ebp,eax - xor eax,ebx - shrd edi,edi,7 - mov esi,edx - vpslld xmm2,xmm2,2 - xor ebp,eax - shld edx,edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [40+esp] - and esi,edi - vpor xmm2,xmm2,xmm4 - xor edi,eax - shrd edx,edx,7 - vmovdqa xmm4,[64+esp] - mov ebp,ecx - xor esi,edi - shld ecx,ecx,5 - add ebx,esi - xor ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [44+esp] - and ebp,edx - xor edx,edi - shrd ecx,ecx,7 - mov esi,ebx - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - add eax,ebx - vpalignr xmm5,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add edi,DWORD [48+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - vpxor xmm3,xmm3,xmm4 - vmovdqa [64+esp],xmm7 - add edi,esi - xor ebp,ecx - vmovdqa xmm7,xmm6 - vpaddd xmm6,xmm6,xmm2 - shrd ebx,ebx,7 - add edi,eax - vpxor xmm3,xmm3,xmm5 - add edx,DWORD [52+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - vpsrld xmm5,xmm3,30 - vmovdqa [32+esp],xmm6 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpslld xmm3,xmm3,2 - add ecx,DWORD [56+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vpor xmm3,xmm3,xmm5 - add ebx,DWORD [60+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [esp] - vpaddd xmm7,xmm7,xmm3 
- xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - vmovdqa [48+esp],xmm7 - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [4+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [8+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [12+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - mov ebp,DWORD [196+esp] - cmp ebp,DWORD [200+esp] - je NEAR L$006done - vmovdqa xmm7,[160+esp] - vmovdqa xmm6,[176+esp] - vmovdqu xmm0,[ebp] - vmovdqu xmm1,[16+ebp] - vmovdqu xmm2,[32+ebp] - vmovdqu xmm3,[48+ebp] - add ebp,64 - vpshufb xmm0,xmm0,xmm6 - mov DWORD [196+esp],ebp - vmovdqa [96+esp],xmm7 - add ebx,DWORD [16+esp] - xor esi,edi - vpshufb xmm1,xmm1,xmm6 - mov ebp,ecx - shld ecx,ecx,5 - vpaddd xmm4,xmm0,xmm7 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - vmovdqa [esp],xmm4 - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - vpshufb xmm2,xmm2,xmm6 - mov ebp,edx - shld edx,edx,5 - vpaddd xmm5,xmm1,xmm7 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vmovdqa [16+esp],xmm5 - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - vpshufb xmm3,xmm3,xmm6 - mov ebp,edi - shld edi,edi,5 - vpaddd xmm6,xmm2,xmm7 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - vmovdqa [32+esp],xmm6 - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - shrd ecx,ecx,7 - add eax,ebx - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov ebx,ecx - mov DWORD [8+ebp],ecx - xor ebx,edx - mov DWORD [12+ebp],edx - mov DWORD [16+ebp],edi - mov ebp,esi - and esi,ebx - mov ebx,ebp - jmp NEAR L$005loop -align 16 -L$006done: - add ebx,DWORD [16+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor 
ebp,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - shrd ecx,ecx,7 - add eax,ebx - vzeroall - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - mov esp,DWORD [204+esp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov DWORD [8+ebp],ecx - mov DWORD [12+ebp],edx - mov DWORD [16+ebp],edi - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$K_XX_XX: -dd 1518500249,1518500249,1518500249,1518500249 -dd 1859775393,1859775393,1859775393,1859775393 -dd 2400959708,2400959708,2400959708,2400959708 -dd 3395469782,3395469782,3395469782,3395469782 -dd 66051,67438087,134810123,202182159 -db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -db 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 -db 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 -db 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 -db 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-ios.ios.arm.S deleted file mode 100644 index 4d1a2e21..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-ios.ios.arm.S +++ /dev/null @@ -1,1494 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -#include - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl _sha1_block_data_order_nohw -.private_extern _sha1_block_data_order_nohw -#ifdef __thumb2__ -.thumb_func _sha1_block_data_order_nohw -#endif - -.align 5 -_sha1_block_data_order_nohw: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - ldmia r0,{r3,r4,r5,r6,r7} -Lloop: - ldr r8,LK_00_19 - mov r14,sp - sub sp,sp,#15*4 - mov r5,r5,ror#30 - mov r6,r6,ror#30 - mov r7,r7,ror#30 @ [6] -L_00_15: -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r6,r8,r6,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r4,r5 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r6,r8,r6,ror#2 @ E+=K_00_19 - eor r10,r4,r5 @ F_xx_xx - add r6,r6,r7,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r3,r10,ror#2 - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r6,r6,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r5,r8,r5,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r3,r4 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r5,r8,r5,ror#2 @ E+=K_00_19 - eor r10,r3,r4 @ F_xx_xx - add r5,r5,r6,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r7,r10,ror#2 - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r5,r5,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r4,r8,r4,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r7,r3 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r4,r8,r4,ror#2 @ E+=K_00_19 - eor r10,r7,r3 @ F_xx_xx - add r4,r4,r5,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r6,r10,ror#2 - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r4,r4,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r3,r8,r3,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r6,r7 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r3,r8,r3,ror#2 @ E+=K_00_19 - eor r10,r6,r7 @ F_xx_xx - add r3,r3,r4,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r5,r10,ror#2 - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r3,r3,r10 @ E+=F_00_19(B,C,D) -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp -#endif - bne L_00_15 @ [((11+4)*5+2)*3] - sub sp,sp,#25*4 -#if __ARM_ARCH<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - add r6,r6,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - add r5,r5,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - add r4,r4,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - add r3,r3,r10 @ E+=F_00_19(B,C,D) - - ldr r8,LK_20_39 @ [+15+16*4] - cmn sp,#0 @ [+3], clear carry to denote 20_39 -L_20_39_or_60_79: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- eor r10,r4,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_20_39(B,C,D) -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp @ preserve carry -#endif - bne L_20_39_or_60_79 @ [+((12+3)*5+2)*4] - bcs L_done @ [+((12+3)*5+2)*4], spare 300 bytes - - ldr r8,LK_40_59 - sub sp,sp,#20*4 @ [+2] -L_40_59: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r4,r10,ror#2 @ F_xx_xx - and r11,r5,r6 @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_40_59(B,C,D) - add r7,r7,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - and r11,r4,r5 @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_40_59(B,C,D) - add r6,r6,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - and r11,r3,r4 @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_40_59(B,C,D) - add r5,r5,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r6,r10,ror#2 @ F_xx_xx - and r11,r7,r3 @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_40_59(B,C,D) - add r4,r4,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - and r11,r6,r7 @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_40_59(B,C,D) - add r3,r3,r11,ror#2 -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp -#endif - bne L_40_59 @ [+((12+5)*5+2)*4] - - ldr r8,LK_60_79 - sub sp,sp,#20*4 - cmp sp,#0 @ set carry to denote 60_79 - b L_20_39_or_60_79 @ [+4], spare 300 bytes -L_done: - add sp,sp,#80*4 @ "deallocate" stack frame - ldmia r0,{r8,r9,r10,r11,r12} - add r3,r8,r3 - add r4,r9,r4 - add r5,r10,r5,ror#2 - add r6,r11,r6,ror#2 - add r7,r12,r7,ror#2 - stmia r0,{r3,r4,r5,r6,r7} - teq r1,r2 - bne Lloop @ [+18], total 1307 - -#if __ARM_ARCH>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - - -.align 5 -LK_00_19:.word 0x5a827999 -LK_20_39:.word 0x6ed9eba1 -LK_40_59:.word 0x8f1bbcdc -LK_60_79:.word 0xca62c1d6 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 5 -#if __ARM_MAX_ARCH__>=7 - - - -.globl _sha1_block_data_order_neon -.private_extern _sha1_block_data_order_neon -#ifdef __thumb2__ -.thumb_func _sha1_block_data_order_neon -#endif -.align 4 -_sha1_block_data_order_neon: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - @ dmb @ errata #451034 on early Cortex A8 - @ vstmdb sp!,{d8-d15} @ ABI specification says so - mov r14,sp - sub r12,sp,#64 - adr r8,LK_00_19 - bic r12,r12,#15 @ align for 128-bit stores - - ldmia r0,{r3,r4,r5,r6,r7} @ load context - mov sp,r12 @ alloca - - vld1.8 {q0,q1},[r1]! @ handles unaligned - veor q15,q15,q15 - vld1.8 {q2,q3},[r1]! - vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 - vrev32.8 q0,q0 @ yes, even on - vrev32.8 q1,q1 @ big-endian... - vrev32.8 q2,q2 - vadd.i32 q8,q0,q14 - vrev32.8 q3,q3 - vadd.i32 q9,q1,q14 - vst1.32 {q8},[r12,:128]! - vadd.i32 q10,q2,q14 - vst1.32 {q9},[r12,:128]! - vst1.32 {q10},[r12,:128]! - ldr r9,[sp] @ big RAW stall - -Loop_neon: - vext.8 q8,q0,q1,#8 - bic r10,r6,r4 - add r7,r7,r9 - and r11,r5,r4 - vadd.i32 q13,q3,q14 - ldr r9,[sp,#4] - add r7,r7,r3,ror#27 - vext.8 q12,q3,q15,#4 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q8,q8,q0 - bic r10,r5,r3 - add r6,r6,r9 - veor q12,q12,q2 - and r11,r4,r3 - ldr r9,[sp,#8] - veor q12,q12,q8 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q13,q15,q12,#4 - bic r10,r4,r7 - add r5,r5,r9 - vadd.i32 q8,q12,q12 - and r11,r3,r7 - ldr r9,[sp,#12] - vsri.32 q8,q12,#31 - add r5,r5,r6,ror#27 - eor r11,r11,r10 - mov r7,r7,ror#2 - vshr.u32 q12,q13,#30 - add r5,r5,r11 - bic r10,r3,r6 - vshl.u32 q13,q13,#2 - add r4,r4,r9 - and r11,r7,r6 - veor q8,q8,q12 - ldr r9,[sp,#16] - add r4,r4,r5,ror#27 - veor q8,q8,q13 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q9,q1,q2,#8 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - vadd.i32 q13,q8,q14 - ldr r9,[sp,#20] - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r4,ror#27 - vext.8 q12,q8,q15,#4 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - veor q9,q9,q1 - bic r10,r6,r4 - add r7,r7,r9 - veor q12,q12,q3 - and r11,r5,r4 - ldr r9,[sp,#24] - veor q12,q12,q9 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q13,q15,q12,#4 - bic r10,r5,r3 - add r6,r6,r9 - vadd.i32 q9,q12,q12 - and r11,r4,r3 - ldr r9,[sp,#28] - vsri.32 q9,q12,#31 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - mov r3,r3,ror#2 - vshr.u32 q12,q13,#30 - add r6,r6,r11 - bic r10,r4,r7 - vshl.u32 q13,q13,#2 - add r5,r5,r9 - and r11,r3,r7 - veor q9,q9,q12 - ldr r9,[sp,#32] - add r5,r5,r6,ror#27 - veor q9,q9,q13 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q10,q2,q3,#8 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - vadd.i32 q13,q9,q14 - ldr r9,[sp,#36] - add r4,r4,r5,ror#27 - vext.8 q12,q9,q15,#4 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q10,q10,q2 - bic r10,r7,r5 - add r3,r3,r9 - veor q12,q12,q8 - and r11,r6,r5 - ldr r9,[sp,#40] - veor q12,q12,q10 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q13,q15,q12,#4 - bic r10,r6,r4 - add r7,r7,r9 - vadd.i32 q10,q12,q12 - and r11,r5,r4 - ldr r9,[sp,#44] - vsri.32 q10,q12,#31 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - mov r4,r4,ror#2 - vshr.u32 q12,q13,#30 - add r7,r7,r11 - bic r10,r5,r3 - vshl.u32 q13,q13,#2 - add r6,r6,r9 - and r11,r4,r3 - veor q10,q10,q12 - ldr r9,[sp,#48] - add r6,r6,r7,ror#27 - veor q10,q10,q13 - eor r11,r11,r10 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q11,q3,q8,#8 - bic r10,r4,r7 - add r5,r5,r9 - and r11,r3,r7 - vadd.i32 q13,q10,q14 - ldr r9,[sp,#52] - add r5,r5,r6,ror#27 - vext.8 q12,q10,q15,#4 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q11,q11,q3 - bic r10,r3,r6 - add r4,r4,r9 - veor q12,q12,q9 - and r11,r7,r6 - ldr r9,[sp,#56] - veor q12,q12,q11 - add r4,r4,r5,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q13,q15,q12,#4 - bic r10,r7,r5 - add r3,r3,r9 - vadd.i32 q11,q12,q12 - and r11,r6,r5 - ldr r9,[sp,#60] - vsri.32 q11,q12,#31 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - vshr.u32 q12,q13,#30 - add r3,r3,r11 - bic r10,r6,r4 - vshl.u32 q13,q13,#2 - add r7,r7,r9 - and r11,r5,r4 - veor q11,q11,q12 - ldr r9,[sp,#0] - add r7,r7,r3,ror#27 - veor q11,q11,q13 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q10,q11,#8 - bic r10,r5,r3 - add r6,r6,r9 - and r11,r4,r3 - veor q0,q0,q8 - ldr r9,[sp,#4] - add r6,r6,r7,ror#27 - veor q0,q0,q1 - eor r11,r11,r10 - mov r3,r3,ror#2 - vadd.i32 q13,q11,q14 - add r6,r6,r11 - bic r10,r4,r7 - veor q12,q12,q0 - add r5,r5,r9 - and r11,r3,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - eor r11,r11,r10 - mov r7,r7,ror#2 - vsli.32 q0,q12,#2 - add r5,r5,r11 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - ldr r9,[sp,#12] - add r4,r4,r5,ror#27 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - ldr r9,[sp,#16] - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q11,q0,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#20] - veor q1,q1,q9 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q1,q1,q2 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q0,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q1 - ldr r9,[sp,#24] - eor r11,r10,r4 - vshr.u32 q1,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q1,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#28] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#32] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q0,q1,#8 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#36] - veor q2,q2,q10 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - veor q2,q2,q3 - mov r5,r5,ror#2 - add r3,r3,r11 - vadd.i32 q13,q1,q14 - eor r10,r4,r6 - vld1.32 {d28[],d29[]},[r8,:32]! - add r7,r7,r9 - veor q12,q12,q2 - ldr r9,[sp,#40] - eor r11,r10,r5 - vshr.u32 q2,q12,#30 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - vst1.32 {q13},[r12,:128]! - add r7,r7,r11 - eor r10,r3,r5 - vsli.32 q2,q12,#2 - add r6,r6,r9 - ldr r9,[sp,#44] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#48] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q1,q2,#8 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r7 - add r4,r4,r5,ror#27 - veor q3,q3,q8 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q2,q14 - eor r10,r5,r7 - add r3,r3,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r6 - vshr.u32 q3,q12,#30 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vst1.32 {q13},[r12,:128]! - add r3,r3,r11 - eor r10,r4,r6 - vsli.32 q3,q12,#2 - add r7,r7,r9 - ldr r9,[sp,#60] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#0] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q2,q3,#8 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#4] - veor q8,q8,q0 - eor r11,r10,r3 - add r5,r5,r6,ror#27 - veor q8,q8,q9 - mov r7,r7,ror#2 - add r5,r5,r11 - vadd.i32 q13,q3,q14 - eor r10,r6,r3 - add r4,r4,r9 - veor q12,q12,q8 - ldr r9,[sp,#8] - eor r11,r10,r7 - vshr.u32 q8,q12,#30 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - add r4,r4,r11 - eor r10,r5,r7 - vsli.32 q8,q12,#2 - add r3,r3,r9 - ldr r9,[sp,#12] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#16] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q3,q8,#8 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#20] - veor q9,q9,q1 - eor r11,r10,r4 - add r6,r6,r7,ror#27 - veor q9,q9,q10 - mov r3,r3,ror#2 - add r6,r6,r11 - vadd.i32 q13,q8,q14 - eor r10,r7,r4 - add r5,r5,r9 - veor q12,q12,q9 - ldr r9,[sp,#24] - eor r11,r10,r3 - vshr.u32 q9,q12,#30 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - vst1.32 {q13},[r12,:128]! 
- add r5,r5,r11 - eor r10,r6,r3 - vsli.32 q9,q12,#2 - add r4,r4,r9 - ldr r9,[sp,#28] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#32] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q8,q9,#8 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#36] - veor q10,q10,q2 - add r7,r7,r3,ror#27 - eor r11,r5,r6 - veor q10,q10,q11 - add r7,r7,r10 - and r11,r11,r4 - vadd.i32 q13,q9,q14 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q12,q12,q10 - add r6,r6,r9 - and r10,r4,r5 - vshr.u32 q10,q12,#30 - ldr r9,[sp,#40] - add r6,r6,r7,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r4,r5 - add r6,r6,r10 - vsli.32 q10,q12,#2 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#44] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#48] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q9,q10,#8 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#52] - veor q11,q11,q3 - add r3,r3,r4,ror#27 - eor r11,r6,r7 - veor q11,q11,q0 - add r3,r3,r10 - and r11,r11,r5 - vadd.i32 q13,q10,q14 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - veor q12,q12,q11 - add r7,r7,r9 - and r10,r5,r6 - vshr.u32 q11,q12,#30 - ldr r9,[sp,#56] - add r7,r7,r3,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r5,r6 - add r7,r7,r10 - vsli.32 q11,q12,#2 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#60] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#0] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q10,q11,#8 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#4] - veor q0,q0,q8 - add r4,r4,r5,ror#27 - eor r11,r7,r3 - veor q0,q0,q1 - add r4,r4,r10 - and r11,r11,r6 - vadd.i32 q13,q11,q14 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q12,q12,q0 - add r3,r3,r9 - and r10,r6,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r3,r3,r4,ror#27 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - eor r11,r6,r7 - add r3,r3,r10 - vsli.32 q0,q12,#2 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#12] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#16] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q11,q0,#8 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#20] - veor q1,q1,q9 - add r5,r5,r6,ror#27 - eor r11,r3,r4 - veor q1,q1,q2 - add r5,r5,r10 - and r11,r11,r7 - vadd.i32 q13,q0,q14 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q12,q12,q1 - add r4,r4,r9 - and r10,r7,r3 - vshr.u32 q1,q12,#30 - ldr r9,[sp,#24] - add r4,r4,r5,ror#27 - vst1.32 {q13},[r12,:128]! 
- eor r11,r7,r3 - add r4,r4,r10 - vsli.32 q1,q12,#2 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#28] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#32] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q0,q1,#8 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#36] - veor q2,q2,q10 - add r6,r6,r7,ror#27 - eor r11,r4,r5 - veor q2,q2,q3 - add r6,r6,r10 - and r11,r11,r3 - vadd.i32 q13,q1,q14 - mov r3,r3,ror#2 - add r6,r6,r11 - veor q12,q12,q2 - add r5,r5,r9 - and r10,r3,r4 - vshr.u32 q2,q12,#30 - ldr r9,[sp,#40] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r3,r4 - add r5,r5,r10 - vsli.32 q2,q12,#2 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#44] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#48] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q1,q2,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q3,q3,q8 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q2,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r4 - vshr.u32 q3,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q3,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#60] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#0] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q3,q14 - eor r10,r5,r7 - add r3,r3,r9 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - teq r1,r2 - sub r8,r8,#16 - it eq - subeq r1,r1,#64 - vld1.8 {q0,q1},[r1]! - ldr r9,[sp,#4] - eor r11,r10,r6 - vld1.8 {q2,q3},[r1]! - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - eor r10,r4,r6 - vrev32.8 q0,q0 - add r7,r7,r9 - ldr r9,[sp,#8] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#12] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#16] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vrev32.8 q1,q1 - eor r10,r6,r3 - add r4,r4,r9 - vadd.i32 q8,q0,q14 - ldr r9,[sp,#20] - eor r11,r10,r7 - vst1.32 {q8},[r12,:128]! - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#24] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#28] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#32] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vrev32.8 q2,q2 - eor r10,r7,r4 - add r5,r5,r9 - vadd.i32 q9,q1,q14 - ldr r9,[sp,#36] - eor r11,r10,r3 - vst1.32 {q9},[r12,:128]! 
- add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#40] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#44] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#48] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vrev32.8 q3,q3 - eor r10,r3,r5 - add r6,r6,r9 - vadd.i32 q10,q2,q14 - ldr r9,[sp,#52] - eor r11,r10,r4 - vst1.32 {q10},[r12,:128]! - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#56] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#60] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - ldmia r0,{r9,r10,r11,r12} @ accumulate context - add r3,r3,r9 - ldr r9,[r0,#16] - add r4,r4,r10 - add r5,r5,r11 - add r6,r6,r12 - it eq - moveq sp,r14 - add r7,r7,r9 - it ne - ldrne r9,[sp] - stmia r0,{r3,r4,r5,r6,r7} - itt ne - addne r12,sp,#3*16 - bne Loop_neon - - @ vldmia sp!,{d8-d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} - -#endif -#if __ARM_MAX_ARCH__>=7 - -# if defined(__thumb2__) -# define INST(a,b,c,d) .byte c,d|0xf,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d|0x10 -# endif - -.globl _sha1_block_data_order_hw -.private_extern _sha1_block_data_order_hw -#ifdef __thumb2__ -.thumb_func _sha1_block_data_order_hw -#endif -.align 5 -_sha1_block_data_order_hw: - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - - veor q1,q1,q1 - adr r3,LK_00_19 - vld1.32 {q0},[r0]! - vld1.32 {d2[0]},[r0] - sub r0,r0,#16 - vld1.32 {d16[],d17[]},[r3,:32]! - vld1.32 {d18[],d19[]},[r3,:32]! - vld1.32 {d20[],d21[]},[r3,:32]! - vld1.32 {d22[],d23[]},[r3,:32] - -Loop_v8: - vld1.8 {q4,q5},[r1]! - vld1.8 {q6,q7},[r1]! 
- vrev32.8 q4,q4 - vrev32.8 q5,q5 - - vadd.i32 q12,q8,q4 - vrev32.8 q6,q6 - vmov q14,q0 @ offload - subs r2,r2,#1 - - vadd.i32 q13,q8,q5 - vrev32.8 q7,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 - INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 - vadd.i32 q12,q8,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 - INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 - vadd.i32 q13,q8,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 - INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 - vadd.i32 q12,q8,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 - INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 - vadd.i32 q13,q9,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 - INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 - vadd.i32 q12,q9,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q9,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q10,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q10,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 - INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 - vadd.i32 q13,q10,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 - INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 - vadd.i32 q13,q11,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q11,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q11,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - 
INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q7 - - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - - vadd.i32 q1,q1,q2 - vadd.i32 q0,q0,q14 - bne Loop_v8 - - vst1.32 {q0},[r0]! - vst1.32 {d2[0]},[r0] - - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - bx lr @ bx lr - -#endif -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-windows.windows.x86.S deleted file mode 100644 index d00f36c7..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-windows.windows.x86.S +++ /dev/null @@ -1,5608 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. - -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -global _sha256_block_data_order_nohw -align 16 -_sha256_block_data_order_nohw: -L$_sha256_block_data_order_nohw_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov ebx,esp - call L$000pic_point -L$000pic_point: - pop ebp - lea ebp,[(L$K256-L$000pic_point)+ebp] - sub esp,16 - and esp,-64 - shl eax,6 - add eax,edi - mov DWORD [esp],esi - mov DWORD [4+esp],edi - mov DWORD [8+esp],eax - mov DWORD [12+esp],ebx -L$001no_xmm: - sub eax,edi - cmp eax,256 - jae NEAR L$002unrolled - jmp NEAR L$003loop -align 16 -L$003loop: - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - bswap eax - mov edx,DWORD [12+edi] - bswap ebx - push eax - bswap ecx - push ebx - bswap edx - push ecx - push edx - mov eax,DWORD [16+edi] - mov ebx,DWORD [20+edi] - mov ecx,DWORD [24+edi] - bswap eax - mov edx,DWORD [28+edi] - bswap ebx - push eax - bswap ecx - push ebx - bswap edx - push ecx - push edx - mov eax,DWORD [32+edi] - mov ebx,DWORD [36+edi] - mov ecx,DWORD [40+edi] - bswap eax - mov edx,DWORD [44+edi] - bswap ebx - push eax - bswap ecx - push ebx - bswap edx - push ecx - push edx - mov eax,DWORD [48+edi] - mov ebx,DWORD [52+edi] - mov ecx,DWORD [56+edi] - bswap eax - mov edx,DWORD [60+edi] - bswap ebx - push eax - bswap ecx - push ebx - bswap edx - push ecx - push edx - add edi,64 - lea esp,[esp-36] - mov DWORD [104+esp],edi - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edi,DWORD [12+esi] - mov DWORD [8+esp],ebx - xor ebx,ecx - mov DWORD [12+esp],ecx - mov DWORD [16+esp],edi - mov DWORD [esp],ebx - mov edx,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov edi,DWORD [28+esi] - mov DWORD [24+esp],ebx - mov DWORD [28+esp],ecx - mov DWORD [32+esp],edi -align 16 -L$00400_15: - mov ecx,edx - mov esi,DWORD [24+esp] - ror ecx,14 - mov edi,DWORD [28+esp] - xor ecx,edx - xor esi,edi - mov ebx,DWORD [96+esp] - ror ecx,5 - and esi,edx - mov DWORD [20+esp],edx - xor edx,ecx - add 
ebx,DWORD [32+esp] - xor esi,edi - ror edx,6 - mov ecx,eax - add ebx,esi - ror ecx,9 - add ebx,edx - mov edi,DWORD [8+esp] - xor ecx,eax - mov DWORD [4+esp],eax - lea esp,[esp-4] - ror ecx,11 - mov esi,DWORD [ebp] - xor ecx,eax - mov edx,DWORD [20+esp] - xor eax,edi - ror ecx,2 - add ebx,esi - mov DWORD [esp],eax - add edx,ebx - and eax,DWORD [4+esp] - add ebx,ecx - xor eax,edi - add ebp,4 - add eax,ebx - cmp esi,3248222580 - jne NEAR L$00400_15 - mov ecx,DWORD [156+esp] - jmp NEAR L$00516_63 -align 16 -L$00516_63: - mov ebx,ecx - mov esi,DWORD [104+esp] - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [160+esp] - shr edi,10 - add ebx,DWORD [124+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [24+esp] - ror ecx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor ecx,edx - xor esi,edi - mov DWORD [96+esp],ebx - ror ecx,5 - and esi,edx - mov DWORD [20+esp],edx - xor edx,ecx - add ebx,DWORD [32+esp] - xor esi,edi - ror edx,6 - mov ecx,eax - add ebx,esi - ror ecx,9 - add ebx,edx - mov edi,DWORD [8+esp] - xor ecx,eax - mov DWORD [4+esp],eax - lea esp,[esp-4] - ror ecx,11 - mov esi,DWORD [ebp] - xor ecx,eax - mov edx,DWORD [20+esp] - xor eax,edi - ror ecx,2 - add ebx,esi - mov DWORD [esp],eax - add edx,ebx - and eax,DWORD [4+esp] - add ebx,ecx - xor eax,edi - mov ecx,DWORD [156+esp] - add ebp,4 - add eax,ebx - cmp esi,3329325298 - jne NEAR L$00516_63 - mov esi,DWORD [356+esp] - mov ebx,DWORD [8+esp] - mov ecx,DWORD [16+esp] - add eax,DWORD [esi] - add ebx,DWORD [4+esi] - add edi,DWORD [8+esi] - add ecx,DWORD [12+esi] - mov DWORD [esi],eax - mov DWORD [4+esi],ebx - mov DWORD [8+esi],edi - mov DWORD [12+esi],ecx - mov eax,DWORD [24+esp] - mov ebx,DWORD [28+esp] - mov ecx,DWORD [32+esp] - mov edi,DWORD [360+esp] - add edx,DWORD [16+esi] - add eax,DWORD [20+esi] - add ebx,DWORD [24+esi] - add ecx,DWORD [28+esi] - mov DWORD [16+esi],edx - mov DWORD [20+esi],eax - mov DWORD [24+esi],ebx - mov DWORD [28+esi],ecx - lea esp,[356+esp] - sub ebp,256 - cmp edi,DWORD [8+esp] - jb NEAR L$003loop - mov esp,DWORD [12+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$K256: -dd 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 -dd 66051,67438087,134810123,202182159 -db 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -db 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -db 62,0 -align 16 -L$002unrolled: - lea esp,[esp-96] - mov eax,DWORD [esi] - mov ebp,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov ebx,DWORD [12+esi] - mov DWORD [4+esp],ebp - xor ebp,ecx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],ebx - mov edx,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov esi,DWORD [28+esi] - mov DWORD [20+esp],ebx - mov DWORD [24+esp],ecx - mov DWORD 
[28+esp],esi - jmp NEAR L$006grand_loop -align 16 -L$006grand_loop: - mov ebx,DWORD [edi] - mov ecx,DWORD [4+edi] - bswap ebx - mov esi,DWORD [8+edi] - bswap ecx - mov DWORD [32+esp],ebx - bswap esi - mov DWORD [36+esp],ecx - mov DWORD [40+esp],esi - mov ebx,DWORD [12+edi] - mov ecx,DWORD [16+edi] - bswap ebx - mov esi,DWORD [20+edi] - bswap ecx - mov DWORD [44+esp],ebx - bswap esi - mov DWORD [48+esp],ecx - mov DWORD [52+esp],esi - mov ebx,DWORD [24+edi] - mov ecx,DWORD [28+edi] - bswap ebx - mov esi,DWORD [32+edi] - bswap ecx - mov DWORD [56+esp],ebx - bswap esi - mov DWORD [60+esp],ecx - mov DWORD [64+esp],esi - mov ebx,DWORD [36+edi] - mov ecx,DWORD [40+edi] - bswap ebx - mov esi,DWORD [44+edi] - bswap ecx - mov DWORD [68+esp],ebx - bswap esi - mov DWORD [72+esp],ecx - mov DWORD [76+esp],esi - mov ebx,DWORD [48+edi] - mov ecx,DWORD [52+edi] - bswap ebx - mov esi,DWORD [56+edi] - bswap ecx - mov DWORD [80+esp],ebx - bswap esi - mov DWORD [84+esp],ecx - mov DWORD [88+esp],esi - mov ebx,DWORD [60+edi] - add edi,64 - bswap ebx - mov DWORD [100+esp],edi - mov DWORD [92+esp],ebx - mov ecx,edx - mov esi,DWORD [20+esp] - ror edx,14 - mov edi,DWORD [24+esp] - xor edx,ecx - mov ebx,DWORD [32+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1116352408+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [16+esp] - ror edx,14 - mov edi,DWORD [20+esp] - xor edx,esi - mov ebx,DWORD [36+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1899447441+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [12+esp] - ror edx,14 - mov edi,DWORD [16+esp] - xor edx,ecx - mov ebx,DWORD [40+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3049323471+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [8+esp] - ror edx,14 - mov edi,DWORD [12+esp] - xor edx,esi - mov ebx,DWORD [44+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3921009573+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [4+esp] - ror edx,14 - mov edi,DWORD [8+esp] - xor edx,ecx - mov ebx,DWORD [48+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD 
[16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[961987163+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [esp] - ror edx,14 - mov edi,DWORD [4+esp] - xor edx,esi - mov ebx,DWORD [52+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1508970993+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [28+esp] - ror edx,14 - mov edi,DWORD [esp] - xor edx,ecx - mov ebx,DWORD [56+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2453635748+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [24+esp] - ror edx,14 - mov edi,DWORD [28+esp] - xor edx,esi - mov ebx,DWORD [60+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2870763221+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [20+esp] - ror edx,14 - mov edi,DWORD [24+esp] - xor edx,ecx - mov ebx,DWORD [64+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3624381080+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [16+esp] - ror edx,14 - mov edi,DWORD [20+esp] - xor edx,esi - mov ebx,DWORD [68+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[310598401+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [12+esp] - ror edx,14 - mov edi,DWORD [16+esp] - xor edx,ecx - mov ebx,DWORD [72+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[607225278+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [8+esp] - ror edx,14 - mov edi,DWORD [12+esp] - xor edx,esi - mov ebx,DWORD [76+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD 
[16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1426881987+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [4+esp] - ror edx,14 - mov edi,DWORD [8+esp] - xor edx,ecx - mov ebx,DWORD [80+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1925078388+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [esp] - ror edx,14 - mov edi,DWORD [4+esp] - xor edx,esi - mov ebx,DWORD [84+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2162078206+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov ecx,edx - mov esi,DWORD [28+esp] - ror edx,14 - mov edi,DWORD [esp] - xor edx,ecx - mov ebx,DWORD [88+esp] - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2614888103+edx*1+ebx] - xor ecx,esi - xor ebp,edi - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov esi,edx - mov ecx,DWORD [24+esp] - ror edx,14 - mov edi,DWORD [28+esp] - xor edx,esi - mov ebx,DWORD [92+esp] - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3248222580+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [36+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [88+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [32+esp] - shr edi,10 - add ebx,DWORD [68+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [32+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3835390401+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [40+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [92+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [36+esp] - shr edi,10 - add ebx,DWORD [72+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov 
edi,DWORD [20+esp] - xor edx,esi - mov DWORD [36+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[4022224774+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [44+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [32+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [40+esp] - shr edi,10 - add ebx,DWORD [76+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [40+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[264347078+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [48+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [36+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [44+esp] - shr edi,10 - add ebx,DWORD [80+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [44+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[604807628+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [52+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [40+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [48+esp] - shr edi,10 - add ebx,DWORD [84+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [48+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[770255983+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [56+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [44+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [52+esp] - shr edi,10 - add ebx,DWORD [88+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [52+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea 
edx,[1249150122+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [60+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [48+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [56+esp] - shr edi,10 - add ebx,DWORD [92+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [56+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1555081692+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [64+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [52+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [60+esp] - shr edi,10 - add ebx,DWORD [32+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [60+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1996064986+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [68+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [56+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [64+esp] - shr edi,10 - add ebx,DWORD [36+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [64+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2554220882+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [72+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [60+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [68+esp] - shr edi,10 - add ebx,DWORD [40+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [68+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2821834349+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [76+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [64+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [72+esp] - shr edi,10 - add ebx,DWORD 
[44+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [72+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2952996808+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [80+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [68+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [76+esp] - shr edi,10 - add ebx,DWORD [48+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [76+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3210313671+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [84+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [72+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [80+esp] - shr edi,10 - add ebx,DWORD [52+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [80+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3336571891+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [88+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [76+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [84+esp] - shr edi,10 - add ebx,DWORD [56+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [84+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3584528711+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [92+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [80+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [88+esp] - shr edi,10 - add ebx,DWORD [60+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [88+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD 
[12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[113926993+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [32+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [84+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [92+esp] - shr edi,10 - add ebx,DWORD [64+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [92+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[338241895+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [36+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [88+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [32+esp] - shr edi,10 - add ebx,DWORD [68+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [32+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[666307205+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [40+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [92+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [36+esp] - shr edi,10 - add ebx,DWORD [72+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [36+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[773529912+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [44+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [32+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [40+esp] - shr edi,10 - add ebx,DWORD [76+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [40+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1294757372+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [48+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [36+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - 
xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [44+esp] - shr edi,10 - add ebx,DWORD [80+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [44+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1396182291+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [52+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [40+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [48+esp] - shr edi,10 - add ebx,DWORD [84+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [48+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1695183700+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [56+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [44+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [52+esp] - shr edi,10 - add ebx,DWORD [88+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [52+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1986661051+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [60+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [48+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [56+esp] - shr edi,10 - add ebx,DWORD [92+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [56+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2177026350+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [64+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [52+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [60+esp] - shr edi,10 - add ebx,DWORD [32+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [60+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor 
edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2456956037+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [68+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [56+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [64+esp] - shr edi,10 - add ebx,DWORD [36+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [64+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2730485921+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [72+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [60+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [68+esp] - shr edi,10 - add ebx,DWORD [40+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [68+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2820302411+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [76+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [64+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [72+esp] - shr edi,10 - add ebx,DWORD [44+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [72+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3259730800+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [80+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [68+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [76+esp] - shr edi,10 - add ebx,DWORD [48+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [76+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3345764771+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [84+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [72+esp] 
- mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [80+esp] - shr edi,10 - add ebx,DWORD [52+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [80+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3516065817+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [88+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [76+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [84+esp] - shr edi,10 - add ebx,DWORD [56+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [84+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3600352804+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [92+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [80+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [88+esp] - shr edi,10 - add ebx,DWORD [60+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [88+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[4094571909+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [32+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [84+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [92+esp] - shr edi,10 - add ebx,DWORD [64+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [92+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[275423344+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [36+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [88+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [32+esp] - shr edi,10 - add ebx,DWORD [68+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [32+esp],ebx - xor esi,edi - ror 
edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[430227734+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [40+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [92+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [36+esp] - shr edi,10 - add ebx,DWORD [72+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [36+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[506948616+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [44+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [32+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [40+esp] - shr edi,10 - add ebx,DWORD [76+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [40+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[659060556+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [48+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [36+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [44+esp] - shr edi,10 - add ebx,DWORD [80+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [44+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[883997877+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [52+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [40+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [48+esp] - shr edi,10 - add ebx,DWORD [84+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [48+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[958139571+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [56+esp] - ror 
ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [44+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [52+esp] - shr edi,10 - add ebx,DWORD [88+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [52+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1322822218+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [60+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [48+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [56+esp] - shr edi,10 - add ebx,DWORD [92+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - mov DWORD [56+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1537002063+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [64+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [52+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [60+esp] - shr edi,10 - add ebx,DWORD [32+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - mov DWORD [60+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[1747873779+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [68+esp] - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [56+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [64+esp] - shr edi,10 - add ebx,DWORD [36+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [20+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [24+esp] - xor edx,ecx - mov DWORD [64+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - add ebx,DWORD [28+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [4+esp] - xor ecx,eax - mov DWORD [esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[1955562222+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [72+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [12+esp] - add ebp,ecx - mov ecx,DWORD [60+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [68+esp] - shr edi,10 - add ebx,DWORD [40+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [16+esp] - ror edx,14 - add 
ebx,edi - mov edi,DWORD [20+esp] - xor edx,esi - mov DWORD [68+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [12+esp],esi - xor edx,esi - add ebx,DWORD [24+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [esp] - xor esi,ebp - mov DWORD [28+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2024104815+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [76+esp] - ror esi,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,esi - mov esi,DWORD [64+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [72+esp] - shr edi,10 - add ebx,DWORD [44+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [12+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [16+esp] - xor edx,ecx - mov DWORD [72+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - add ebx,DWORD [20+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [28+esp] - xor ecx,eax - mov DWORD [24+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2227730452+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [80+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [4+esp] - add ebp,ecx - mov ecx,DWORD [68+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [76+esp] - shr edi,10 - add ebx,DWORD [48+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [8+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [12+esp] - xor edx,esi - mov DWORD [76+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [4+esp],esi - xor edx,esi - add ebx,DWORD [16+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [24+esp] - xor esi,ebp - mov DWORD [20+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[2361852424+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [84+esp] - ror esi,2 - add eax,edx - add edx,DWORD [esp] - add eax,esi - mov esi,DWORD [72+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [80+esp] - shr edi,10 - add ebx,DWORD [52+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [4+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [8+esp] - xor edx,ecx - mov DWORD [80+esp],ebx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - add ebx,DWORD [12+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [20+esp] - xor ecx,eax - mov DWORD [16+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[2428436474+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [88+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [28+esp] - add ebp,ecx - mov ecx,DWORD [76+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [84+esp] - shr edi,10 - add ebx,DWORD [56+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [4+esp] - xor edx,esi - mov DWORD [84+esp],ebx - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [28+esp],esi - xor edx,esi - add ebx,DWORD [8+esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [16+esp] - xor esi,ebp - mov DWORD [12+esp],ebp - xor ebp,edi - ror esi,11 - and 
eax,ebp - lea edx,[2756734187+edx*1+ebx] - xor esi,ecx - xor eax,edi - mov ecx,DWORD [92+esp] - ror esi,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,esi - mov esi,DWORD [80+esp] - mov ebx,ecx - ror ecx,11 - mov edi,esi - ror esi,2 - xor ecx,ebx - shr ebx,3 - ror ecx,7 - xor esi,edi - xor ebx,ecx - ror esi,17 - add ebx,DWORD [88+esp] - shr edi,10 - add ebx,DWORD [60+esp] - mov ecx,edx - xor edi,esi - mov esi,DWORD [28+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [esp] - xor edx,ecx - xor esi,edi - ror edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - add ebx,DWORD [4+esp] - xor edi,esi - ror edx,6 - mov ecx,eax - add ebx,edi - ror ecx,9 - mov esi,eax - mov edi,DWORD [12+esp] - xor ecx,eax - mov DWORD [8+esp],eax - xor eax,edi - ror ecx,11 - and ebp,eax - lea edx,[3204031479+edx*1+ebx] - xor ecx,esi - xor ebp,edi - mov esi,DWORD [32+esp] - ror ecx,2 - add ebp,edx - add edx,DWORD [20+esp] - add ebp,ecx - mov ecx,DWORD [84+esp] - mov ebx,esi - ror esi,11 - mov edi,ecx - ror ecx,2 - xor esi,ebx - shr ebx,3 - ror esi,7 - xor ecx,edi - xor ebx,esi - ror ecx,17 - add ebx,DWORD [92+esp] - shr edi,10 - add ebx,DWORD [64+esp] - mov esi,edx - xor edi,ecx - mov ecx,DWORD [24+esp] - ror edx,14 - add ebx,edi - mov edi,DWORD [28+esp] - xor edx,esi - xor ecx,edi - ror edx,5 - and ecx,esi - mov DWORD [20+esp],esi - xor edx,esi - add ebx,DWORD [esp] - xor edi,ecx - ror edx,6 - mov esi,ebp - add ebx,edi - ror esi,9 - mov ecx,ebp - mov edi,DWORD [8+esp] - xor esi,ebp - mov DWORD [4+esp],ebp - xor ebp,edi - ror esi,11 - and eax,ebp - lea edx,[3329325298+edx*1+ebx] - xor esi,ecx - xor eax,edi - ror esi,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,esi - mov esi,DWORD [96+esp] - xor ebp,edi - mov ecx,DWORD [12+esp] - add eax,DWORD [esi] - add ebp,DWORD [4+esi] - add edi,DWORD [8+esi] - add ecx,DWORD [12+esi] - mov DWORD [esi],eax - mov DWORD [4+esi],ebp - mov DWORD [8+esi],edi - mov DWORD [12+esi],ecx - mov DWORD [4+esp],ebp - xor ebp,edi - mov DWORD [8+esp],edi - mov DWORD [12+esp],ecx - mov edi,DWORD [20+esp] - mov ebx,DWORD [24+esp] - mov ecx,DWORD [28+esp] - add edx,DWORD [16+esi] - add edi,DWORD [20+esi] - add ebx,DWORD [24+esi] - add ecx,DWORD [28+esi] - mov DWORD [16+esi],edx - mov DWORD [20+esi],edi - mov DWORD [24+esi],ebx - mov DWORD [28+esi],ecx - mov DWORD [20+esp],edi - mov edi,DWORD [100+esp] - mov DWORD [24+esp],ebx - mov DWORD [28+esp],ecx - cmp edi,DWORD [104+esp] - jb NEAR L$006grand_loop - mov esp,DWORD [108+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _sha256_block_data_order_ssse3 -align 16 -_sha256_block_data_order_ssse3: -L$_sha256_block_data_order_ssse3_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov ebx,esp - call L$007pic_point -L$007pic_point: - pop ebp - lea ebp,[(L$K256-L$007pic_point)+ebp] - sub esp,16 - and esp,-64 - shl eax,6 - add eax,edi - mov DWORD [esp],esi - mov DWORD [4+esp],edi - mov DWORD [8+esp],eax - mov DWORD [12+esp],ebx - lea esp,[esp-96] - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edi,DWORD [12+esi] - mov DWORD [4+esp],ebx - xor ebx,ecx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],edi - mov edx,DWORD [16+esi] - mov edi,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov esi,DWORD [28+esi] - mov DWORD [20+esp],edi - mov edi,DWORD [100+esp] - mov DWORD [24+esp],ecx - mov DWORD [28+esp],esi - movdqa xmm7,[256+ebp] - jmp NEAR L$008grand_ssse3 -align 16 -L$008grand_ssse3: - movdqu xmm0,[edi] - movdqu xmm1,[16+edi] - movdqu 
xmm2,[32+edi] - movdqu xmm3,[48+edi] - add edi,64 -db 102,15,56,0,199 - mov DWORD [100+esp],edi -db 102,15,56,0,207 - movdqa xmm4,[ebp] -db 102,15,56,0,215 - movdqa xmm5,[16+ebp] - paddd xmm4,xmm0 -db 102,15,56,0,223 - movdqa xmm6,[32+ebp] - paddd xmm5,xmm1 - movdqa xmm7,[48+ebp] - movdqa [32+esp],xmm4 - paddd xmm6,xmm2 - movdqa [48+esp],xmm5 - paddd xmm7,xmm3 - movdqa [64+esp],xmm6 - movdqa [80+esp],xmm7 - jmp NEAR L$009ssse3_00_47 -align 16 -L$009ssse3_00_47: - add ebp,64 - mov ecx,edx - movdqa xmm4,xmm1 - ror edx,14 - mov esi,DWORD [20+esp] - movdqa xmm7,xmm3 - xor edx,ecx - mov edi,DWORD [24+esp] -db 102,15,58,15,224,4 - xor esi,edi - ror edx,5 - and esi,ecx -db 102,15,58,15,250,4 - mov DWORD [16+esp],ecx - xor edx,ecx - xor edi,esi - movdqa xmm5,xmm4 - ror edx,6 - mov ecx,eax - movdqa xmm6,xmm4 - add edx,edi - mov edi,DWORD [4+esp] - psrld xmm4,3 - mov esi,eax - ror ecx,9 - paddd xmm0,xmm7 - mov DWORD [esp],eax - xor ecx,eax - psrld xmm6,7 - xor eax,edi - add edx,DWORD [28+esp] - ror ecx,11 - and ebx,eax - pshufd xmm7,xmm3,250 - xor ecx,esi - add edx,DWORD [32+esp] - pslld xmm5,14 - xor ebx,edi - ror ecx,2 - pxor xmm4,xmm6 - add ebx,edx - add edx,DWORD [12+esp] - psrld xmm6,11 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm4,xmm5 - mov esi,DWORD [16+esp] - xor edx,ecx - pslld xmm5,11 - mov edi,DWORD [20+esp] - xor esi,edi - ror edx,5 - pxor xmm4,xmm6 - and esi,ecx - mov DWORD [12+esp],ecx - movdqa xmm6,xmm7 - xor edx,ecx - xor edi,esi - ror edx,6 - pxor xmm4,xmm5 - mov ecx,ebx - add edx,edi - psrld xmm7,10 - mov edi,DWORD [esp] - mov esi,ebx - ror ecx,9 - paddd xmm0,xmm4 - mov DWORD [28+esp],ebx - xor ecx,ebx - psrlq xmm6,17 - xor ebx,edi - add edx,DWORD [24+esp] - ror ecx,11 - pxor xmm7,xmm6 - and eax,ebx - xor ecx,esi - psrlq xmm6,2 - add edx,DWORD [36+esp] - xor eax,edi - ror ecx,2 - pxor xmm7,xmm6 - add eax,edx - add edx,DWORD [8+esp] - pshufd xmm7,xmm7,128 - add eax,ecx - mov ecx,edx - ror edx,14 - mov esi,DWORD [12+esp] - xor edx,ecx - mov edi,DWORD [16+esp] - xor esi,edi - ror edx,5 - and esi,ecx - psrldq xmm7,8 - mov DWORD [8+esp],ecx - xor edx,ecx - xor edi,esi - paddd xmm0,xmm7 - ror edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [28+esp] - mov esi,eax - ror ecx,9 - mov DWORD [24+esp],eax - pshufd xmm7,xmm0,80 - xor ecx,eax - xor eax,edi - add edx,DWORD [20+esp] - movdqa xmm6,xmm7 - ror ecx,11 - psrld xmm7,10 - and ebx,eax - psrlq xmm6,17 - xor ecx,esi - add edx,DWORD [40+esp] - xor ebx,edi - ror ecx,2 - pxor xmm7,xmm6 - add ebx,edx - add edx,DWORD [4+esp] - psrlq xmm6,2 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm7,xmm6 - mov esi,DWORD [8+esp] - xor edx,ecx - mov edi,DWORD [12+esp] - pshufd xmm7,xmm7,8 - xor esi,edi - ror edx,5 - movdqa xmm6,[ebp] - and esi,ecx - mov DWORD [4+esp],ecx - pslldq xmm7,8 - xor edx,ecx - xor edi,esi - ror edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [24+esp] - mov esi,ebx - ror ecx,9 - paddd xmm0,xmm7 - mov DWORD [20+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [16+esp] - paddd xmm6,xmm0 - ror ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [44+esp] - xor eax,edi - ror ecx,2 - add eax,edx - add edx,DWORD [esp] - add eax,ecx - movdqa [32+esp],xmm6 - mov ecx,edx - movdqa xmm4,xmm2 - ror edx,14 - mov esi,DWORD [4+esp] - movdqa xmm7,xmm0 - xor edx,ecx - mov edi,DWORD [8+esp] -db 102,15,58,15,225,4 - xor esi,edi - ror edx,5 - and esi,ecx -db 102,15,58,15,251,4 - mov DWORD [esp],ecx - xor edx,ecx - xor edi,esi - movdqa xmm5,xmm4 - ror edx,6 - mov ecx,eax - movdqa xmm6,xmm4 - add edx,edi - mov edi,DWORD [20+esp] - psrld 
xmm4,3 - mov esi,eax - ror ecx,9 - paddd xmm1,xmm7 - mov DWORD [16+esp],eax - xor ecx,eax - psrld xmm6,7 - xor eax,edi - add edx,DWORD [12+esp] - ror ecx,11 - and ebx,eax - pshufd xmm7,xmm0,250 - xor ecx,esi - add edx,DWORD [48+esp] - pslld xmm5,14 - xor ebx,edi - ror ecx,2 - pxor xmm4,xmm6 - add ebx,edx - add edx,DWORD [28+esp] - psrld xmm6,11 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm4,xmm5 - mov esi,DWORD [esp] - xor edx,ecx - pslld xmm5,11 - mov edi,DWORD [4+esp] - xor esi,edi - ror edx,5 - pxor xmm4,xmm6 - and esi,ecx - mov DWORD [28+esp],ecx - movdqa xmm6,xmm7 - xor edx,ecx - xor edi,esi - ror edx,6 - pxor xmm4,xmm5 - mov ecx,ebx - add edx,edi - psrld xmm7,10 - mov edi,DWORD [16+esp] - mov esi,ebx - ror ecx,9 - paddd xmm1,xmm4 - mov DWORD [12+esp],ebx - xor ecx,ebx - psrlq xmm6,17 - xor ebx,edi - add edx,DWORD [8+esp] - ror ecx,11 - pxor xmm7,xmm6 - and eax,ebx - xor ecx,esi - psrlq xmm6,2 - add edx,DWORD [52+esp] - xor eax,edi - ror ecx,2 - pxor xmm7,xmm6 - add eax,edx - add edx,DWORD [24+esp] - pshufd xmm7,xmm7,128 - add eax,ecx - mov ecx,edx - ror edx,14 - mov esi,DWORD [28+esp] - xor edx,ecx - mov edi,DWORD [esp] - xor esi,edi - ror edx,5 - and esi,ecx - psrldq xmm7,8 - mov DWORD [24+esp],ecx - xor edx,ecx - xor edi,esi - paddd xmm1,xmm7 - ror edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [12+esp] - mov esi,eax - ror ecx,9 - mov DWORD [8+esp],eax - pshufd xmm7,xmm1,80 - xor ecx,eax - xor eax,edi - add edx,DWORD [4+esp] - movdqa xmm6,xmm7 - ror ecx,11 - psrld xmm7,10 - and ebx,eax - psrlq xmm6,17 - xor ecx,esi - add edx,DWORD [56+esp] - xor ebx,edi - ror ecx,2 - pxor xmm7,xmm6 - add ebx,edx - add edx,DWORD [20+esp] - psrlq xmm6,2 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm7,xmm6 - mov esi,DWORD [24+esp] - xor edx,ecx - mov edi,DWORD [28+esp] - pshufd xmm7,xmm7,8 - xor esi,edi - ror edx,5 - movdqa xmm6,[16+ebp] - and esi,ecx - mov DWORD [20+esp],ecx - pslldq xmm7,8 - xor edx,ecx - xor edi,esi - ror edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [8+esp] - mov esi,ebx - ror ecx,9 - paddd xmm1,xmm7 - mov DWORD [4+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [esp] - paddd xmm6,xmm1 - ror ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [60+esp] - xor eax,edi - ror ecx,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,ecx - movdqa [48+esp],xmm6 - mov ecx,edx - movdqa xmm4,xmm3 - ror edx,14 - mov esi,DWORD [20+esp] - movdqa xmm7,xmm1 - xor edx,ecx - mov edi,DWORD [24+esp] -db 102,15,58,15,226,4 - xor esi,edi - ror edx,5 - and esi,ecx -db 102,15,58,15,248,4 - mov DWORD [16+esp],ecx - xor edx,ecx - xor edi,esi - movdqa xmm5,xmm4 - ror edx,6 - mov ecx,eax - movdqa xmm6,xmm4 - add edx,edi - mov edi,DWORD [4+esp] - psrld xmm4,3 - mov esi,eax - ror ecx,9 - paddd xmm2,xmm7 - mov DWORD [esp],eax - xor ecx,eax - psrld xmm6,7 - xor eax,edi - add edx,DWORD [28+esp] - ror ecx,11 - and ebx,eax - pshufd xmm7,xmm1,250 - xor ecx,esi - add edx,DWORD [64+esp] - pslld xmm5,14 - xor ebx,edi - ror ecx,2 - pxor xmm4,xmm6 - add ebx,edx - add edx,DWORD [12+esp] - psrld xmm6,11 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm4,xmm5 - mov esi,DWORD [16+esp] - xor edx,ecx - pslld xmm5,11 - mov edi,DWORD [20+esp] - xor esi,edi - ror edx,5 - pxor xmm4,xmm6 - and esi,ecx - mov DWORD [12+esp],ecx - movdqa xmm6,xmm7 - xor edx,ecx - xor edi,esi - ror edx,6 - pxor xmm4,xmm5 - mov ecx,ebx - add edx,edi - psrld xmm7,10 - mov edi,DWORD [esp] - mov esi,ebx - ror ecx,9 - paddd xmm2,xmm4 - mov DWORD [28+esp],ebx - xor ecx,ebx - psrlq xmm6,17 - xor ebx,edi - add edx,DWORD [24+esp] - ror 
ecx,11 - pxor xmm7,xmm6 - and eax,ebx - xor ecx,esi - psrlq xmm6,2 - add edx,DWORD [68+esp] - xor eax,edi - ror ecx,2 - pxor xmm7,xmm6 - add eax,edx - add edx,DWORD [8+esp] - pshufd xmm7,xmm7,128 - add eax,ecx - mov ecx,edx - ror edx,14 - mov esi,DWORD [12+esp] - xor edx,ecx - mov edi,DWORD [16+esp] - xor esi,edi - ror edx,5 - and esi,ecx - psrldq xmm7,8 - mov DWORD [8+esp],ecx - xor edx,ecx - xor edi,esi - paddd xmm2,xmm7 - ror edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [28+esp] - mov esi,eax - ror ecx,9 - mov DWORD [24+esp],eax - pshufd xmm7,xmm2,80 - xor ecx,eax - xor eax,edi - add edx,DWORD [20+esp] - movdqa xmm6,xmm7 - ror ecx,11 - psrld xmm7,10 - and ebx,eax - psrlq xmm6,17 - xor ecx,esi - add edx,DWORD [72+esp] - xor ebx,edi - ror ecx,2 - pxor xmm7,xmm6 - add ebx,edx - add edx,DWORD [4+esp] - psrlq xmm6,2 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm7,xmm6 - mov esi,DWORD [8+esp] - xor edx,ecx - mov edi,DWORD [12+esp] - pshufd xmm7,xmm7,8 - xor esi,edi - ror edx,5 - movdqa xmm6,[32+ebp] - and esi,ecx - mov DWORD [4+esp],ecx - pslldq xmm7,8 - xor edx,ecx - xor edi,esi - ror edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [24+esp] - mov esi,ebx - ror ecx,9 - paddd xmm2,xmm7 - mov DWORD [20+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [16+esp] - paddd xmm6,xmm2 - ror ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [76+esp] - xor eax,edi - ror ecx,2 - add eax,edx - add edx,DWORD [esp] - add eax,ecx - movdqa [64+esp],xmm6 - mov ecx,edx - movdqa xmm4,xmm0 - ror edx,14 - mov esi,DWORD [4+esp] - movdqa xmm7,xmm2 - xor edx,ecx - mov edi,DWORD [8+esp] -db 102,15,58,15,227,4 - xor esi,edi - ror edx,5 - and esi,ecx -db 102,15,58,15,249,4 - mov DWORD [esp],ecx - xor edx,ecx - xor edi,esi - movdqa xmm5,xmm4 - ror edx,6 - mov ecx,eax - movdqa xmm6,xmm4 - add edx,edi - mov edi,DWORD [20+esp] - psrld xmm4,3 - mov esi,eax - ror ecx,9 - paddd xmm3,xmm7 - mov DWORD [16+esp],eax - xor ecx,eax - psrld xmm6,7 - xor eax,edi - add edx,DWORD [12+esp] - ror ecx,11 - and ebx,eax - pshufd xmm7,xmm2,250 - xor ecx,esi - add edx,DWORD [80+esp] - pslld xmm5,14 - xor ebx,edi - ror ecx,2 - pxor xmm4,xmm6 - add ebx,edx - add edx,DWORD [28+esp] - psrld xmm6,11 - add ebx,ecx - mov ecx,edx - ror edx,14 - pxor xmm4,xmm5 - mov esi,DWORD [esp] - xor edx,ecx - pslld xmm5,11 - mov edi,DWORD [4+esp] - xor esi,edi - ror edx,5 - pxor xmm4,xmm6 - and esi,ecx - mov DWORD [28+esp],ecx - movdqa xmm6,xmm7 - xor edx,ecx - xor edi,esi - ror edx,6 - pxor xmm4,xmm5 - mov ecx,ebx - add edx,edi - psrld xmm7,10 - mov edi,DWORD [16+esp] - mov esi,ebx - ror ecx,9 - paddd xmm3,xmm4 - mov DWORD [12+esp],ebx - xor ecx,ebx - psrlq xmm6,17 - xor ebx,edi - add edx,DWORD [8+esp] - ror ecx,11 - pxor xmm7,xmm6 - and eax,ebx - xor ecx,esi - psrlq xmm6,2 - add edx,DWORD [84+esp] - xor eax,edi - ror ecx,2 - pxor xmm7,xmm6 - add eax,edx - add edx,DWORD [24+esp] - pshufd xmm7,xmm7,128 - add eax,ecx - mov ecx,edx - ror edx,14 - mov esi,DWORD [28+esp] - xor edx,ecx - mov edi,DWORD [esp] - xor esi,edi - ror edx,5 - and esi,ecx - psrldq xmm7,8 - mov DWORD [24+esp],ecx - xor edx,ecx - xor edi,esi - paddd xmm3,xmm7 - ror edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [12+esp] - mov esi,eax - ror ecx,9 - mov DWORD [8+esp],eax - pshufd xmm7,xmm3,80 - xor ecx,eax - xor eax,edi - add edx,DWORD [4+esp] - movdqa xmm6,xmm7 - ror ecx,11 - psrld xmm7,10 - and ebx,eax - psrlq xmm6,17 - xor ecx,esi - add edx,DWORD [88+esp] - xor ebx,edi - ror ecx,2 - pxor xmm7,xmm6 - add ebx,edx - add edx,DWORD [20+esp] - psrlq xmm6,2 - add ebx,ecx - mov 
ecx,edx

[… remaining SSSE3 rounds and the _sha256_block_data_order_avx routine of this deleted generated file omitted …]

-	mov	esp,DWORD [108+esp]
-	vzeroall
-	pop	edi
-	pop	esi
-	pop	ebx
-	pop	ebp
-	ret
-%else
-; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
-ret
-%endif
-#endif  // defined(__i386__) && defined(_WIN32)
-#if defined(__linux__) && defined(__ELF__)
-.section	.note.GNU-stack,"",%progbits
-#endif
-
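For orientation while reading these deleted generated files: the SSSE3/AVX x86 code above and the ARMv4 code below are unrolled, scheduled implementations of the same SHA-256 compression function. The plain-C sketch below is not part of the diff and not BoringSSL's code; apart from the K256 constants (which match the table embedded in the assembly), all identifiers are illustrative. It shows the per-block computation the assembly implements: the sigma0/sigma1 message schedule and the Sigma1/Ch/Maj/Sigma0 round update named in the assembly comments.

/*
 * Illustrative reference only; not part of this diff and not BoringSSL API.
 */
#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n) {
  return (x >> n) | (x << (32 - n));
}

/* Same constants as the K256 table embedded in the deleted assembly. */
static const uint32_t K256[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
    0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
    0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};

/* One 64-byte block; state[8] is the running a..h chaining value. */
static void sha256_block_ref(uint32_t state[8], const uint8_t block[64]) {
  uint32_t w[64];
  for (int t = 0; t < 16; t++) {
    /* Message words are loaded big-endian (the rev / pshufb steps above). */
    w[t] = (uint32_t)block[4 * t] << 24 | (uint32_t)block[4 * t + 1] << 16 |
           (uint32_t)block[4 * t + 2] << 8 | (uint32_t)block[4 * t + 3];
  }
  for (int t = 16; t < 64; t++) {
    /* sigma0(X[i+1]) and sigma1(X[i+14]) in the assembly comments. */
    uint32_t s0 = rotr32(w[t - 15], 7) ^ rotr32(w[t - 15], 18) ^ (w[t - 15] >> 3);
    uint32_t s1 = rotr32(w[t - 2], 17) ^ rotr32(w[t - 2], 19) ^ (w[t - 2] >> 10);
    w[t] = w[t - 16] + s0 + w[t - 7] + s1;
  }
  uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
  uint32_t e = state[4], f = state[5], g = state[6], h = state[7];
  for (int t = 0; t < 64; t++) {
    /* Sigma1(e), Ch(e,f,g), Sigma0(a), Maj(a,b,c): the quantities the
       ror/shrd and eor/xor sequences in the assembly compute. */
    uint32_t S1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    uint32_t ch = (e & f) ^ (~e & g);
    uint32_t t1 = h + S1 + ch + K256[t] + w[t];
    uint32_t S0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    uint32_t t2 = S0 + maj;
    h = g; g = f; f = e; e = d + t1;
    d = c; c = b; b = a; a = t1 + t2;
  }
  state[0] += a; state[1] += b; state[2] += c; state[3] += d;
  state[4] += e; state[5] += f; state[6] += g; state[7] += h;
}

The generated files compute exactly this, but with the rounds fully unrolled and the message schedule carried in vector registers; only the per-target scheduling differs.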
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-ios.ios.arm.S
deleted file mode 100644
index b82984e3..00000000
--- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-ios.ios.arm.S
+++ /dev/null
@@ -1,2852 +0,0 @@
-#define BORINGSSL_PREFIX CCryptoBoringSSL
-#if defined(__arm__) && defined(__APPLE__)
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
-@
-@ Licensed under the OpenSSL license (the "License").  You may not use
-@ this file except in compliance with the License.  You can obtain a copy
-@ in the file LICENSE in the source distribution or at
-@ https://www.openssl.org/source/license.html
-
-
-@ ====================================================================
-@ Written by Andy Polyakov for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Permission to use under GPL terms is granted.
-@ ====================================================================
-
-@ SHA256 block procedure for ARMv4. May 2007.
-
-@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
-@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-@ byte [on single-issue Xscale PXA250 core].
-
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
-@ Cortex A8 core and ~20 cycles per processed byte.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 16%
-@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-@ September 2013.
-@
-@ Add NEON implementation. On Cortex A8 it was measured to process one
-@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-@ code (meaning that latter performs sub-optimally, nothing was done
-@ about it).
-
-@ May 2014.
-@
-@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-#ifndef __KERNEL__
-# include
-#else
-# define __ARM_ARCH __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
-@ instructions are manually-encoded. (See unsha256.)
-
-.text
-#if defined(__thumb2__)
-.syntax	unified
-.thumb
-#else
-.code	32
-#endif
-
-.align	5
-K256:
-.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.word	0	@ terminator
-.align	5
-
-.globl	_sha256_block_data_order_nohw
-.private_extern	_sha256_block_data_order_nohw
-#ifdef __thumb2__
-.thumb_func	_sha256_block_data_order_nohw
-#endif
-_sha256_block_data_order_nohw:
-	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
-	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
-	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
-	adr	r14,K256
-	sub	sp,sp,#16*4		@ alloca(X[16])
-Loop:
-# if __ARM_ARCH>=7
-	ldr	r2,[r1],#4
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r5,r6		@ magic
-	eor	r12,r12,r12

[… the expanded BODY_00_15 and BODY_16_xx round code of this deleted generated file (rounds 0 through 26 in this hunk) is omitted …]

-#if 26==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif -#if 26<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#12*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#9*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#12*4] @ 27 - @ ldr r1,[sp,#9*4] - mov r0,r2,ror#7 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#11*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#4*4] - - add r3,r3,r0 - eor r0,r5,r5,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r5,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#11*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 27==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 27<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#13*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#10*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#13*4] @ 28 - @ ldr r1,[sp,#10*4] - mov r0,r2,ror#7 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#12*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#5*4] - - add r12,r12,r0 - eor r0,r4,r4,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r4,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#12*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 28==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 28<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#14*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#11*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#14*4] @ 29 - @ ldr r1,[sp,#11*4] - mov r0,r2,ror#7 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#13*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#6*4] - - add r3,r3,r0 - eor r0,r11,r11,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r11,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#13*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 29==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 29<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#15*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#12*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#15*4] @ 30 - @ ldr r1,[sp,#12*4] - mov r0,r2,ror#7 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#14*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#7*4] - - add r12,r12,r0 - eor r0,r10,r10,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r10,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#14*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 30==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 30<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#0*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#13*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#0*4] @ 31 - @ ldr r1,[sp,#13*4] - mov r0,r2,ror#7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#15*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#8*4] - - add r3,r3,r0 - eor r0,r9,r9,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r9,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#15*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 31==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 31<15 -# if __ARM_ARCH>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#1*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#14*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH>=7 - ite eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq r3,[sp,#16*4] @ pull ctx - bne Lrounds_16_xx - - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r0,[r3,#0] - ldr r2,[r3,#4] - ldr r12,[r3,#8] - add r4,r4,r0 - ldr r0,[r3,#12] - add r5,r5,r2 - ldr r2,[r3,#16] - add r6,r6,r12 - ldr r12,[r3,#20] - add r7,r7,r0 - ldr r0,[r3,#24] - add r8,r8,r2 - ldr r2,[r3,#28] - add r9,r9,r12 - ldr r1,[sp,#17*4] @ pull inp - ldr r12,[sp,#18*4] @ pull inp+len - add r10,r10,r0 - add r11,r11,r2 - stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} - cmp r1,r12 - sub r14,r14,#256 @ rewind Ktbl - bne Loop - - add sp,sp,#19*4 @ destroy frame -#if __ARM_ARCH>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -LK256_shortcut_neon: -@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. -#if defined(__thumb2__) -.word K256-(LK256_add_neon+4) -#else -.word K256-(LK256_add_neon+8) -#endif - -.globl _sha256_block_data_order_neon -.private_extern _sha256_block_data_order_neon -#ifdef __thumb2__ -.thumb_func _sha256_block_data_order_neon -#endif -.align 5 -.skip 16 -_sha256_block_data_order_neon: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - - sub r11,sp,#16*4+16 - - @ K256 is just at the boundary of being easily referenced by an ADR from - @ this function. In Arm mode, when building with __ARM_ARCH=6, it does - @ not fit. By moving code around, we could make it fit, but this is too - @ fragile. For simplicity, just load the offset from - @ .LK256_shortcut_neon. - @ - @ TODO(davidben): adrl would avoid a load, but clang-assembler does not - @ support it. We might be able to emulate it with a macro, but Android's - @ did not work when I tried it. 
- @ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm - ldr r14,LK256_shortcut_neon -LK256_add_neon: - add r14,pc,r14 - - bic r11,r11,#15 @ align for 128-bit stores - mov r12,sp - mov sp,r11 @ alloca - add r2,r1,r2,lsl#6 @ len to point at the end of inp - - vld1.8 {q0},[r1]! - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - vld1.32 {q8},[r14,:128]! - vld1.32 {q9},[r14,:128]! - vld1.32 {q10},[r14,:128]! - vld1.32 {q11},[r14,:128]! - vrev32.8 q0,q0 @ yes, even on - str r0,[sp,#64] - vrev32.8 q1,q1 @ big-endian - str r1,[sp,#68] - mov r1,sp - vrev32.8 q2,q2 - str r2,[sp,#72] - vrev32.8 q3,q3 - str r12,[sp,#76] @ save original sp - vadd.i32 q8,q8,q0 - vadd.i32 q9,q9,q1 - vst1.32 {q8},[r1,:128]! - vadd.i32 q10,q10,q2 - vst1.32 {q9},[r1,:128]! - vadd.i32 q11,q11,q3 - vst1.32 {q10},[r1,:128]! - vst1.32 {q11},[r1,:128]! - - ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} - sub r1,r1,#64 - ldr r2,[sp,#0] - eor r12,r12,r12 - eor r3,r5,r6 - b L_00_48 - -.align 4 -L_00_48: - vext.8 q8,q0,q1,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q2,q3,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q0,q0,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#4] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d7,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d7,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d7,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q0,q0,q9 - add r10,r10,r2 - ldr r2,[sp,#8] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d7,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d7,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d0,d0,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d0,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d0,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d0,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#12] - and r3,r3,r12 - vshr.u32 d24,d0,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! - add r8,r8,r2 - vsli.32 d24,d0,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d1,d1,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q0 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! 
- add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q1,q2,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q3,q0,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q1,q1,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#20] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d1,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d1,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d1,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q1,q1,q9 - add r6,r6,r2 - ldr r2,[sp,#24] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d1,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d1,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d2,d2,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d2,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d2,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d2,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#28] - and r3,r3,r12 - vshr.u32 d24,d2,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! - add r4,r4,r2 - vsli.32 d24,d2,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d3,d3,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q1 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vext.8 q8,q2,q3,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q0,q1,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q2,q2,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#36] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d3,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d3,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d3,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q2,q2,q9 - add r10,r10,r2 - ldr r2,[sp,#40] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d3,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d3,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d4,d4,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d4,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d4,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d4,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#44] - and r3,r3,r12 - vshr.u32 d24,d4,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! 
- add r8,r8,r2 - vsli.32 d24,d4,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d5,d5,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q2 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q3,q0,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q1,q2,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q3,q3,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#52] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d5,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d5,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d5,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q3,q3,q9 - add r6,r6,r2 - ldr r2,[sp,#56] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d5,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d5,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d6,d6,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d6,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d6,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d6,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#60] - and r3,r3,r12 - vshr.u32 d24,d6,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! - add r4,r4,r2 - vsli.32 d24,d6,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d7,d7,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q3 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[r14] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - teq r2,#0 @ check for K256 terminator - ldr r2,[sp,#0] - sub r1,r1,#64 - bne L_00_48 - - ldr r1,[sp,#68] - ldr r0,[sp,#72] - sub r14,r14,#256 @ rewind r14 - teq r1,r0 - it eq - subeq r1,r1,#64 @ avoid SEGV - vld1.8 {q0},[r1]! @ load next input block - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - it ne - strne r1,[sp,#68] - mov r1,sp - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! 
- and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q0,q0 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q0 - ldr r2,[sp,#4] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#8] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#12] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q1,q1 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q1 - ldr r2,[sp,#20] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#24] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#28] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! 
- and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q2,q2 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q2 - ldr r2,[sp,#36] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#40] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#44] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q3,q3 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q3 - ldr r2,[sp,#52] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#56] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#60] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#64] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! - ldr r0,[r2,#0] - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r12,[r2,#4] - ldr r3,[r2,#8] - ldr r1,[r2,#12] - add r4,r4,r0 @ accumulate - ldr r0,[r2,#16] - add r5,r5,r12 - ldr r12,[r2,#20] - add r6,r6,r3 - ldr r3,[r2,#24] - add r7,r7,r1 - ldr r1,[r2,#28] - add r8,r8,r0 - str r4,[r2],#4 - add r9,r9,r12 - str r5,[r2],#4 - add r10,r10,r3 - str r6,[r2],#4 - add r11,r11,r1 - str r7,[r2],#4 - stmia r2,{r8,r9,r10,r11} - - ittte ne - movne r1,sp - ldrne r2,[sp,#0] - eorne r12,r12,r12 - ldreq sp,[sp,#76] @ restore original sp - itt ne - eorne r3,r5,r6 - bne L_00_48 - - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} - -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - -# if defined(__thumb2__) -# define INST(a,b,c,d) .byte c,d|0xc,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d -# endif - -LK256_shortcut_hw: -@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. 
-#if defined(__thumb2__) -.word K256-(LK256_add_hw+4) -#else -.word K256-(LK256_add_hw+8) -#endif - -.globl _sha256_block_data_order_hw -.private_extern _sha256_block_data_order_hw -#ifdef __thumb2__ -.thumb_func _sha256_block_data_order_hw -#endif -.align 5 -_sha256_block_data_order_hw: - @ K256 is too far to reference from one ADR command in Thumb mode. In - @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte - @ boundary. For simplicity, just load the offset from .LK256_shortcut_hw. - ldr r3,LK256_shortcut_hw -LK256_add_hw: - add r3,pc,r3 - - vld1.32 {q0,q1},[r0] - add r2,r1,r2,lsl#6 @ len to point at the end of inp - b Loop_v8 - -.align 4 -Loop_v8: - vld1.8 {q8,q9},[r1]! - vld1.8 {q10,q11},[r1]! - vld1.32 {q12},[r3]! - vrev32.8 q8,q8 - vrev32.8 q9,q9 - vrev32.8 q10,q10 - vrev32.8 q11,q11 - vmov q14,q0 @ offload - vmov q15,q1 - teq r1,r2 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! 
- vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - - vld1.32 {q13},[r3] - vadd.i32 q12,q12,q10 - sub r3,r3,#256-16 @ rewind - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - - vadd.i32 q13,q13,q11 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - - vadd.i32 q0,q0,q14 - vadd.i32 q1,q1,q15 - it ne - bne Loop_v8 - - vst1.32 {q0,q1},[r0] - - bx lr @ bx lr - -#endif -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-windows.windows.x86.S deleted file mode 100644 index 8e1da2bb..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-windows.windows.x86.S +++ /dev/null @@ -1,2853 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -global _sha512_block_data_order -align 16 -_sha512_block_data_order: -L$_sha512_block_data_order_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov ebx,esp - call L$000pic_point -L$000pic_point: - pop ebp - lea ebp,[(L$001K512-L$000pic_point)+ebp] - sub esp,16 - and esp,-64 - shl eax,7 - add eax,edi - mov DWORD [esp],esi - mov DWORD [4+esp],edi - mov DWORD [8+esp],eax - mov DWORD [12+esp],ebx - lea edx,[_OPENSSL_ia32cap_P] - mov ecx,DWORD [edx] - test ecx,67108864 - jz NEAR L$002loop_x86 - mov edx,DWORD [4+edx] - movq mm0,[esi] - and ecx,16777216 - movq mm1,[8+esi] - and edx,512 - movq mm2,[16+esi] - or ecx,edx - movq mm3,[24+esi] - movq mm4,[32+esi] - movq mm5,[40+esi] - movq mm6,[48+esi] - movq mm7,[56+esi] - cmp ecx,16777728 - je NEAR L$003SSSE3 - sub esp,80 - jmp NEAR L$004loop_sse2 -align 16 -L$004loop_sse2: - movq [8+esp],mm1 - movq [16+esp],mm2 - movq [24+esp],mm3 - movq [40+esp],mm5 - movq [48+esp],mm6 - pxor mm2,mm1 - movq [56+esp],mm7 - movq mm3,mm0 - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - add edi,8 - mov edx,15 - bswap eax - bswap ebx - jmp NEAR L$00500_14_sse2 -align 16 -L$00500_14_sse2: - movd mm1,eax - mov eax,DWORD [edi] - movd mm7,ebx - mov ebx,DWORD [4+edi] - add edi,8 - bswap eax - bswap ebx - punpckldq mm7,mm1 - movq mm1,mm4 - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - movq mm0,mm3 - movq [72+esp],mm7 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - paddq mm7,[ebp] - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - sub esp,8 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm3,mm2 - movq mm2,mm0 - add ebp,8 - paddq mm3,mm6 - movq mm6,[48+esp] - dec edx - jnz NEAR L$00500_14_sse2 - movd mm1,eax - movd mm7,ebx - punpckldq mm7,mm1 - movq mm1,mm4 - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - movq mm0,mm3 - movq [72+esp],mm7 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - paddq mm7,[ebp] - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - sub esp,8 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm7,[192+esp] - paddq mm3,mm2 - movq mm2,mm0 - add ebp,8 - paddq mm3,mm6 - pxor mm0,mm0 - mov edx,32 - jmp NEAR L$00616_79_sse2 -align 16 -L$00616_79_sse2: - movq mm5,[88+esp] - movq mm1,mm7 - psrlq mm7,1 - movq mm6,mm5 - psrlq mm5,6 - psllq mm1,56 - paddq mm0,mm3 - movq mm3,mm7 - psrlq mm7,6 - 
pxor mm3,mm1 - psllq mm1,7 - pxor mm3,mm7 - psrlq mm7,1 - pxor mm3,mm1 - movq mm1,mm5 - psrlq mm5,13 - pxor mm7,mm3 - psllq mm6,3 - pxor mm1,mm5 - paddq mm7,[200+esp] - pxor mm1,mm6 - psrlq mm5,42 - paddq mm7,[128+esp] - pxor mm1,mm5 - psllq mm6,42 - movq mm5,[40+esp] - pxor mm1,mm6 - movq mm6,[48+esp] - paddq mm7,mm1 - movq mm1,mm4 - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - movq [72+esp],mm7 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - paddq mm7,[ebp] - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - sub esp,8 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm7,[192+esp] - paddq mm2,mm6 - add ebp,8 - movq mm5,[88+esp] - movq mm1,mm7 - psrlq mm7,1 - movq mm6,mm5 - psrlq mm5,6 - psllq mm1,56 - paddq mm2,mm3 - movq mm3,mm7 - psrlq mm7,6 - pxor mm3,mm1 - psllq mm1,7 - pxor mm3,mm7 - psrlq mm7,1 - pxor mm3,mm1 - movq mm1,mm5 - psrlq mm5,13 - pxor mm7,mm3 - psllq mm6,3 - pxor mm1,mm5 - paddq mm7,[200+esp] - pxor mm1,mm6 - psrlq mm5,42 - paddq mm7,[128+esp] - pxor mm1,mm5 - psllq mm6,42 - movq mm5,[40+esp] - pxor mm1,mm6 - movq mm6,[48+esp] - paddq mm7,mm1 - movq mm1,mm4 - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - movq [72+esp],mm7 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - paddq mm7,[ebp] - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - sub esp,8 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm7,[192+esp] - paddq mm0,mm6 - add ebp,8 - dec edx - jnz NEAR L$00616_79_sse2 - paddq mm0,mm3 - movq mm1,[8+esp] - movq mm3,[24+esp] - movq mm5,[40+esp] - movq mm6,[48+esp] - movq mm7,[56+esp] - pxor mm2,mm1 - paddq mm0,[esi] - paddq mm1,[8+esi] - paddq mm2,[16+esi] - paddq mm3,[24+esi] - paddq mm4,[32+esi] - paddq mm5,[40+esi] - paddq mm6,[48+esi] - paddq mm7,[56+esi] - mov eax,640 - movq [esi],mm0 - movq [8+esi],mm1 - movq [16+esi],mm2 - movq [24+esi],mm3 - movq [32+esi],mm4 - movq [40+esi],mm5 - movq [48+esi],mm6 - movq [56+esi],mm7 - lea esp,[eax*1+esp] - sub ebp,eax - cmp edi,DWORD [88+esp] - jb NEAR L$004loop_sse2 - mov esp,DWORD [92+esp] - emms - pop edi - pop esi - pop ebx - pop ebp - ret -align 32 -L$003SSSE3: - lea edx,[esp-64] - sub esp,256 - movdqa xmm1,[640+ebp] - movdqu xmm0,[edi] -db 102,15,56,0,193 - movdqa xmm3,[ebp] - movdqa xmm2,xmm1 - movdqu xmm1,[16+edi] - paddq xmm3,xmm0 -db 102,15,56,0,202 - movdqa [edx-128],xmm3 - movdqa xmm4,[16+ebp] - movdqa xmm3,xmm2 - movdqu xmm2,[32+edi] - paddq xmm4,xmm1 -db 102,15,56,0,211 - movdqa [edx-112],xmm4 - movdqa xmm5,[32+ebp] - movdqa xmm4,xmm3 - movdqu xmm3,[48+edi] - paddq xmm5,xmm2 -db 102,15,56,0,220 - movdqa [edx-96],xmm5 - movdqa xmm6,[48+ebp] - movdqa xmm5,xmm4 - movdqu xmm4,[64+edi] - paddq xmm6,xmm3 -db 102,15,56,0,229 - movdqa [edx-80],xmm6 - movdqa 
xmm7,[64+ebp] - movdqa xmm6,xmm5 - movdqu xmm5,[80+edi] - paddq xmm7,xmm4 -db 102,15,56,0,238 - movdqa [edx-64],xmm7 - movdqa [edx],xmm0 - movdqa xmm0,[80+ebp] - movdqa xmm7,xmm6 - movdqu xmm6,[96+edi] - paddq xmm0,xmm5 -db 102,15,56,0,247 - movdqa [edx-48],xmm0 - movdqa [16+edx],xmm1 - movdqa xmm1,[96+ebp] - movdqa xmm0,xmm7 - movdqu xmm7,[112+edi] - paddq xmm1,xmm6 -db 102,15,56,0,248 - movdqa [edx-32],xmm1 - movdqa [32+edx],xmm2 - movdqa xmm2,[112+ebp] - movdqa xmm0,[edx] - paddq xmm2,xmm7 - movdqa [edx-16],xmm2 - nop -align 32 -L$007loop_ssse3: - movdqa xmm2,[16+edx] - movdqa [48+edx],xmm3 - lea ebp,[128+ebp] - movq [8+esp],mm1 - mov ebx,edi - movq [16+esp],mm2 - lea edi,[128+edi] - movq [24+esp],mm3 - cmp edi,eax - movq [40+esp],mm5 - cmovb ebx,edi - movq [48+esp],mm6 - mov ecx,4 - pxor mm2,mm1 - movq [56+esp],mm7 - pxor mm3,mm3 - jmp NEAR L$00800_47_ssse3 -align 32 -L$00800_47_ssse3: - movdqa xmm3,xmm5 - movdqa xmm1,xmm2 -db 102,15,58,15,208,8 - movdqa [edx],xmm4 -db 102,15,58,15,220,8 - movdqa xmm4,xmm2 - psrlq xmm2,7 - paddq xmm0,xmm3 - movdqa xmm3,xmm4 - psrlq xmm4,1 - psllq xmm3,56 - pxor xmm2,xmm4 - psrlq xmm4,7 - pxor xmm2,xmm3 - psllq xmm3,7 - pxor xmm2,xmm4 - movdqa xmm4,xmm7 - pxor xmm2,xmm3 - movdqa xmm3,xmm7 - psrlq xmm4,6 - paddq xmm0,xmm2 - movdqa xmm2,xmm7 - psrlq xmm3,19 - psllq xmm2,3 - pxor xmm4,xmm3 - psrlq xmm3,42 - pxor xmm4,xmm2 - psllq xmm2,42 - pxor xmm4,xmm3 - movdqa xmm3,[32+edx] - pxor xmm4,xmm2 - movdqa xmm2,[ebp] - movq mm1,mm4 - paddq xmm0,xmm4 - movq mm7,[edx-128] - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - paddq xmm2,xmm0 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[32+esp] - paddq mm2,mm6 - movq mm6,[40+esp] - movq mm1,mm4 - movq mm7,[edx-120] - pxor mm5,mm6 - psrlq mm1,14 - movq [24+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [56+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[48+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[16+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[24+esp] - paddq mm0,mm6 - movq mm6,[32+esp] - movdqa [edx-128],xmm2 - movdqa xmm4,xmm6 - movdqa xmm2,xmm3 -db 102,15,58,15,217,8 - movdqa [16+edx],xmm5 -db 102,15,58,15,229,8 - movdqa xmm5,xmm3 - psrlq xmm3,7 - paddq xmm1,xmm4 - movdqa xmm4,xmm5 - psrlq xmm5,1 - psllq xmm4,56 - pxor xmm3,xmm5 - psrlq xmm5,7 - pxor xmm3,xmm4 - psllq xmm4,7 - pxor xmm3,xmm5 - movdqa xmm5,xmm0 - pxor xmm3,xmm4 - movdqa xmm4,xmm0 - psrlq xmm5,6 - paddq xmm1,xmm3 - movdqa xmm3,xmm0 - psrlq xmm4,19 - psllq xmm3,3 - pxor xmm5,xmm4 - psrlq xmm4,42 - pxor xmm5,xmm3 - psllq xmm3,42 - pxor xmm5,xmm4 - movdqa xmm4,[48+edx] - pxor xmm5,xmm3 - movdqa 
xmm3,[16+ebp] - movq mm1,mm4 - paddq xmm1,xmm5 - movq mm7,[edx-112] - pxor mm5,mm6 - psrlq mm1,14 - movq [16+esp],mm4 - paddq xmm3,xmm1 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [48+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[40+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[8+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[56+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[16+esp] - paddq mm2,mm6 - movq mm6,[24+esp] - movq mm1,mm4 - movq mm7,[edx-104] - pxor mm5,mm6 - psrlq mm1,14 - movq [8+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [40+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[32+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[48+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[8+esp] - paddq mm0,mm6 - movq mm6,[16+esp] - movdqa [edx-112],xmm3 - movdqa xmm5,xmm7 - movdqa xmm3,xmm4 -db 102,15,58,15,226,8 - movdqa [32+edx],xmm6 -db 102,15,58,15,238,8 - movdqa xmm6,xmm4 - psrlq xmm4,7 - paddq xmm2,xmm5 - movdqa xmm5,xmm6 - psrlq xmm6,1 - psllq xmm5,56 - pxor xmm4,xmm6 - psrlq xmm6,7 - pxor xmm4,xmm5 - psllq xmm5,7 - pxor xmm4,xmm6 - movdqa xmm6,xmm1 - pxor xmm4,xmm5 - movdqa xmm5,xmm1 - psrlq xmm6,6 - paddq xmm2,xmm4 - movdqa xmm4,xmm1 - psrlq xmm5,19 - psllq xmm4,3 - pxor xmm6,xmm5 - psrlq xmm5,42 - pxor xmm6,xmm4 - psllq xmm4,42 - pxor xmm6,xmm5 - movdqa xmm5,[edx] - pxor xmm6,xmm4 - movdqa xmm4,[32+ebp] - movq mm1,mm4 - paddq xmm2,xmm6 - movq mm7,[edx-96] - pxor mm5,mm6 - psrlq mm1,14 - movq [esp],mm4 - paddq xmm4,xmm2 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [32+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[24+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[56+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[40+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[esp] - paddq mm2,mm6 - movq mm6,[8+esp] - movq mm1,mm4 - movq mm7,[edx-88] - pxor mm5,mm6 - psrlq mm1,14 - movq [56+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [24+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[16+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[48+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[32+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[56+esp] 
- paddq mm0,mm6 - movq mm6,[esp] - movdqa [edx-96],xmm4 - movdqa xmm6,xmm0 - movdqa xmm4,xmm5 -db 102,15,58,15,235,8 - movdqa [48+edx],xmm7 -db 102,15,58,15,247,8 - movdqa xmm7,xmm5 - psrlq xmm5,7 - paddq xmm3,xmm6 - movdqa xmm6,xmm7 - psrlq xmm7,1 - psllq xmm6,56 - pxor xmm5,xmm7 - psrlq xmm7,7 - pxor xmm5,xmm6 - psllq xmm6,7 - pxor xmm5,xmm7 - movdqa xmm7,xmm2 - pxor xmm5,xmm6 - movdqa xmm6,xmm2 - psrlq xmm7,6 - paddq xmm3,xmm5 - movdqa xmm5,xmm2 - psrlq xmm6,19 - psllq xmm5,3 - pxor xmm7,xmm6 - psrlq xmm6,42 - pxor xmm7,xmm5 - psllq xmm5,42 - pxor xmm7,xmm6 - movdqa xmm6,[16+edx] - pxor xmm7,xmm5 - movdqa xmm5,[48+ebp] - movq mm1,mm4 - paddq xmm3,xmm7 - movq mm7,[edx-80] - pxor mm5,mm6 - psrlq mm1,14 - movq [48+esp],mm4 - paddq xmm5,xmm3 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [16+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[8+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[40+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[24+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[48+esp] - paddq mm2,mm6 - movq mm6,[56+esp] - movq mm1,mm4 - movq mm7,[edx-72] - pxor mm5,mm6 - psrlq mm1,14 - movq [40+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [8+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[32+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[16+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm0,mm6 - movq mm6,[48+esp] - movdqa [edx-80],xmm5 - movdqa xmm7,xmm1 - movdqa xmm5,xmm6 -db 102,15,58,15,244,8 - movdqa [edx],xmm0 -db 102,15,58,15,248,8 - movdqa xmm0,xmm6 - psrlq xmm6,7 - paddq xmm4,xmm7 - movdqa xmm7,xmm0 - psrlq xmm0,1 - psllq xmm7,56 - pxor xmm6,xmm0 - psrlq xmm0,7 - pxor xmm6,xmm7 - psllq xmm7,7 - pxor xmm6,xmm0 - movdqa xmm0,xmm3 - pxor xmm6,xmm7 - movdqa xmm7,xmm3 - psrlq xmm0,6 - paddq xmm4,xmm6 - movdqa xmm6,xmm3 - psrlq xmm7,19 - psllq xmm6,3 - pxor xmm0,xmm7 - psrlq xmm7,42 - pxor xmm0,xmm6 - psllq xmm6,42 - pxor xmm0,xmm7 - movdqa xmm7,[32+edx] - pxor xmm0,xmm6 - movdqa xmm6,[64+ebp] - movq mm1,mm4 - paddq xmm4,xmm0 - movq mm7,[edx-64] - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - paddq xmm6,xmm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[32+esp] - paddq mm2,mm6 - movq mm6,[40+esp] - movq mm1,mm4 - movq mm7,[edx-56] - pxor mm5,mm6 - psrlq mm1,14 - movq 
[24+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [56+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[48+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[16+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[24+esp] - paddq mm0,mm6 - movq mm6,[32+esp] - movdqa [edx-64],xmm6 - movdqa xmm0,xmm2 - movdqa xmm6,xmm7 -db 102,15,58,15,253,8 - movdqa [16+edx],xmm1 -db 102,15,58,15,193,8 - movdqa xmm1,xmm7 - psrlq xmm7,7 - paddq xmm5,xmm0 - movdqa xmm0,xmm1 - psrlq xmm1,1 - psllq xmm0,56 - pxor xmm7,xmm1 - psrlq xmm1,7 - pxor xmm7,xmm0 - psllq xmm0,7 - pxor xmm7,xmm1 - movdqa xmm1,xmm4 - pxor xmm7,xmm0 - movdqa xmm0,xmm4 - psrlq xmm1,6 - paddq xmm5,xmm7 - movdqa xmm7,xmm4 - psrlq xmm0,19 - psllq xmm7,3 - pxor xmm1,xmm0 - psrlq xmm0,42 - pxor xmm1,xmm7 - psllq xmm7,42 - pxor xmm1,xmm0 - movdqa xmm0,[48+edx] - pxor xmm1,xmm7 - movdqa xmm7,[80+ebp] - movq mm1,mm4 - paddq xmm5,xmm1 - movq mm7,[edx-48] - pxor mm5,mm6 - psrlq mm1,14 - movq [16+esp],mm4 - paddq xmm7,xmm5 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [48+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[40+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[8+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[56+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[16+esp] - paddq mm2,mm6 - movq mm6,[24+esp] - movq mm1,mm4 - movq mm7,[edx-40] - pxor mm5,mm6 - psrlq mm1,14 - movq [8+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [40+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[32+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[48+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[8+esp] - paddq mm0,mm6 - movq mm6,[16+esp] - movdqa [edx-48],xmm7 - movdqa xmm1,xmm3 - movdqa xmm7,xmm0 -db 102,15,58,15,198,8 - movdqa [32+edx],xmm2 -db 102,15,58,15,202,8 - movdqa xmm2,xmm0 - psrlq xmm0,7 - paddq xmm6,xmm1 - movdqa xmm1,xmm2 - psrlq xmm2,1 - psllq xmm1,56 - pxor xmm0,xmm2 - psrlq xmm2,7 - pxor xmm0,xmm1 - psllq xmm1,7 - pxor xmm0,xmm2 - movdqa xmm2,xmm5 - pxor xmm0,xmm1 - movdqa xmm1,xmm5 - psrlq xmm2,6 - paddq xmm6,xmm0 - movdqa xmm0,xmm5 - psrlq xmm1,19 - psllq xmm0,3 - pxor xmm2,xmm1 - psrlq xmm1,42 - pxor xmm2,xmm0 - psllq xmm0,42 - pxor xmm2,xmm1 - movdqa xmm1,[edx] - pxor xmm2,xmm0 - movdqa xmm0,[96+ebp] - movq mm1,mm4 - paddq xmm6,xmm2 - movq mm7,[edx-32] - pxor mm5,mm6 - psrlq mm1,14 - movq [esp],mm4 - paddq xmm0,xmm6 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor 
mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [32+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[24+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[56+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[40+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[esp] - paddq mm2,mm6 - movq mm6,[8+esp] - movq mm1,mm4 - movq mm7,[edx-24] - pxor mm5,mm6 - psrlq mm1,14 - movq [56+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [24+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[16+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[48+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[32+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[56+esp] - paddq mm0,mm6 - movq mm6,[esp] - movdqa [edx-32],xmm0 - movdqa xmm2,xmm4 - movdqa xmm0,xmm1 -db 102,15,58,15,207,8 - movdqa [48+edx],xmm3 -db 102,15,58,15,211,8 - movdqa xmm3,xmm1 - psrlq xmm1,7 - paddq xmm7,xmm2 - movdqa xmm2,xmm3 - psrlq xmm3,1 - psllq xmm2,56 - pxor xmm1,xmm3 - psrlq xmm3,7 - pxor xmm1,xmm2 - psllq xmm2,7 - pxor xmm1,xmm3 - movdqa xmm3,xmm6 - pxor xmm1,xmm2 - movdqa xmm2,xmm6 - psrlq xmm3,6 - paddq xmm7,xmm1 - movdqa xmm1,xmm6 - psrlq xmm2,19 - psllq xmm1,3 - pxor xmm3,xmm2 - psrlq xmm2,42 - pxor xmm3,xmm1 - psllq xmm1,42 - pxor xmm3,xmm2 - movdqa xmm2,[16+edx] - pxor xmm3,xmm1 - movdqa xmm1,[112+ebp] - movq mm1,mm4 - paddq xmm7,xmm3 - movq mm7,[edx-16] - pxor mm5,mm6 - psrlq mm1,14 - movq [48+esp],mm4 - paddq xmm1,xmm7 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [16+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[8+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[40+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[24+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[48+esp] - paddq mm2,mm6 - movq mm6,[56+esp] - movq mm1,mm4 - movq mm7,[edx-8] - pxor mm5,mm6 - psrlq mm1,14 - movq [40+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [8+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[32+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[16+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm0,mm6 - movq mm6,[48+esp] - movdqa [edx-16],xmm1 - lea ebp,[128+ebp] - dec ecx - jnz NEAR L$00800_47_ssse3 - movdqa xmm1,[ebp] - lea ebp,[ebp-640] - movdqu xmm0,[ebx] -db 102,15,56,0,193 - movdqa xmm3,[ebp] - movdqa 
xmm2,xmm1 - movdqu xmm1,[16+ebx] - paddq xmm3,xmm0 -db 102,15,56,0,202 - movq mm1,mm4 - movq mm7,[edx-128] - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[32+esp] - paddq mm2,mm6 - movq mm6,[40+esp] - movq mm1,mm4 - movq mm7,[edx-120] - pxor mm5,mm6 - psrlq mm1,14 - movq [24+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [56+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[48+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[16+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[24+esp] - paddq mm0,mm6 - movq mm6,[32+esp] - movdqa [edx-128],xmm3 - movdqa xmm4,[16+ebp] - movdqa xmm3,xmm2 - movdqu xmm2,[32+ebx] - paddq xmm4,xmm1 -db 102,15,56,0,211 - movq mm1,mm4 - movq mm7,[edx-112] - pxor mm5,mm6 - psrlq mm1,14 - movq [16+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [48+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[40+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[8+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[56+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[16+esp] - paddq mm2,mm6 - movq mm6,[24+esp] - movq mm1,mm4 - movq mm7,[edx-104] - pxor mm5,mm6 - psrlq mm1,14 - movq [8+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [40+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[32+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[48+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[8+esp] - paddq mm0,mm6 - movq mm6,[16+esp] - movdqa [edx-112],xmm4 - movdqa xmm5,[32+ebp] - movdqa xmm4,xmm3 - movdqu xmm3,[48+ebx] - paddq xmm5,xmm2 -db 102,15,56,0,220 - movq mm1,mm4 - movq mm7,[edx-96] - pxor mm5,mm6 - psrlq mm1,14 - movq [esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [32+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[24+esp] - pxor mm3,mm1 
- psllq mm4,4 - pxor mm3,mm4 - movq mm4,[56+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[40+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[esp] - paddq mm2,mm6 - movq mm6,[8+esp] - movq mm1,mm4 - movq mm7,[edx-88] - pxor mm5,mm6 - psrlq mm1,14 - movq [56+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [24+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[16+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[48+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[32+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[56+esp] - paddq mm0,mm6 - movq mm6,[esp] - movdqa [edx-96],xmm5 - movdqa xmm6,[48+ebp] - movdqa xmm5,xmm4 - movdqu xmm4,[64+ebx] - paddq xmm6,xmm3 -db 102,15,56,0,229 - movq mm1,mm4 - movq mm7,[edx-80] - pxor mm5,mm6 - psrlq mm1,14 - movq [48+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [16+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[8+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[40+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[24+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[48+esp] - paddq mm2,mm6 - movq mm6,[56+esp] - movq mm1,mm4 - movq mm7,[edx-72] - pxor mm5,mm6 - psrlq mm1,14 - movq [40+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [8+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[32+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[16+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm0,mm6 - movq mm6,[48+esp] - movdqa [edx-80],xmm6 - movdqa xmm7,[64+ebp] - movdqa xmm6,xmm5 - movdqu xmm5,[80+ebx] - paddq xmm7,xmm4 -db 102,15,56,0,238 - movq mm1,mm4 - movq mm7,[edx-64] - pxor mm5,mm6 - psrlq mm1,14 - movq [32+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[56+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[24+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[8+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[32+esp] - paddq mm2,mm6 - movq 
mm6,[40+esp] - movq mm1,mm4 - movq mm7,[edx-56] - pxor mm5,mm6 - psrlq mm1,14 - movq [24+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [56+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[48+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[16+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[24+esp] - paddq mm0,mm6 - movq mm6,[32+esp] - movdqa [edx-64],xmm7 - movdqa [edx],xmm0 - movdqa xmm0,[80+ebp] - movdqa xmm7,xmm6 - movdqu xmm6,[96+ebx] - paddq xmm0,xmm5 -db 102,15,56,0,247 - movq mm1,mm4 - movq mm7,[edx-48] - pxor mm5,mm6 - psrlq mm1,14 - movq [16+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [48+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[40+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[8+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[56+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[16+esp] - paddq mm2,mm6 - movq mm6,[24+esp] - movq mm1,mm4 - movq mm7,[edx-40] - pxor mm5,mm6 - psrlq mm1,14 - movq [8+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [40+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[32+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[48+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[8+esp] - paddq mm0,mm6 - movq mm6,[16+esp] - movdqa [edx-48],xmm0 - movdqa [16+edx],xmm1 - movdqa xmm1,[96+ebp] - movdqa xmm0,xmm7 - movdqu xmm7,[112+ebx] - paddq xmm1,xmm6 -db 102,15,56,0,248 - movq mm1,mm4 - movq mm7,[edx-32] - pxor mm5,mm6 - psrlq mm1,14 - movq [esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [32+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[24+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[56+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[40+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[esp] - paddq mm2,mm6 - movq mm6,[8+esp] - movq mm1,mm4 - movq mm7,[edx-24] - pxor mm5,mm6 - psrlq mm1,14 - movq [56+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [24+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[16+esp] - pxor mm3,mm1 - psllq mm4,4 - 
pxor mm3,mm4 - movq mm4,[48+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[32+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[56+esp] - paddq mm0,mm6 - movq mm6,[esp] - movdqa [edx-32],xmm1 - movdqa [32+edx],xmm2 - movdqa xmm2,[112+ebp] - movdqa xmm0,[edx] - paddq xmm2,xmm7 - movq mm1,mm4 - movq mm7,[edx-16] - pxor mm5,mm6 - psrlq mm1,14 - movq [48+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm0,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [16+esp],mm0 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[8+esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[40+esp] - paddq mm3,mm7 - movq mm5,mm0 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm0 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[24+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm0,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm2,mm0 - psllq mm6,6 - pxor mm7,mm5 - pxor mm2,mm1 - pxor mm6,mm7 - movq mm5,[48+esp] - paddq mm2,mm6 - movq mm6,[56+esp] - movq mm1,mm4 - movq mm7,[edx-8] - pxor mm5,mm6 - psrlq mm1,14 - movq [40+esp],mm4 - pand mm5,mm4 - psllq mm4,23 - paddq mm2,mm3 - movq mm3,mm1 - psrlq mm1,4 - pxor mm5,mm6 - pxor mm3,mm4 - psllq mm4,23 - pxor mm3,mm1 - movq [8+esp],mm2 - paddq mm7,mm5 - pxor mm3,mm4 - psrlq mm1,23 - paddq mm7,[esp] - pxor mm3,mm1 - psllq mm4,4 - pxor mm3,mm4 - movq mm4,[32+esp] - paddq mm3,mm7 - movq mm5,mm2 - psrlq mm5,28 - paddq mm4,mm3 - movq mm6,mm2 - movq mm7,mm5 - psllq mm6,25 - movq mm1,[16+esp] - psrlq mm5,6 - pxor mm7,mm6 - psllq mm6,5 - pxor mm7,mm5 - pxor mm2,mm1 - psrlq mm5,5 - pxor mm7,mm6 - pand mm0,mm2 - psllq mm6,6 - pxor mm7,mm5 - pxor mm0,mm1 - pxor mm6,mm7 - movq mm5,[40+esp] - paddq mm0,mm6 - movq mm6,[48+esp] - movdqa [edx-16],xmm2 - movq mm1,[8+esp] - paddq mm0,mm3 - movq mm3,[24+esp] - movq mm7,[56+esp] - pxor mm2,mm1 - paddq mm0,[esi] - paddq mm1,[8+esi] - paddq mm2,[16+esi] - paddq mm3,[24+esi] - paddq mm4,[32+esi] - paddq mm5,[40+esi] - paddq mm6,[48+esi] - paddq mm7,[56+esi] - movq [esi],mm0 - movq [8+esi],mm1 - movq [16+esi],mm2 - movq [24+esi],mm3 - movq [32+esi],mm4 - movq [40+esi],mm5 - movq [48+esi],mm6 - movq [56+esi],mm7 - cmp edi,eax - jb NEAR L$007loop_ssse3 - mov esp,DWORD [76+edx] - emms - pop edi - pop esi - pop ebx - pop ebp - ret -align 16 -L$002loop_x86: - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [16+edi] - mov ebx,DWORD [20+edi] - mov ecx,DWORD [24+edi] - mov edx,DWORD [28+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [32+edi] - mov ebx,DWORD [36+edi] - mov ecx,DWORD [40+edi] - mov edx,DWORD [44+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [48+edi] - mov ebx,DWORD [52+edi] - mov ecx,DWORD [56+edi] - mov edx,DWORD [60+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [64+edi] - mov ebx,DWORD [68+edi] - mov ecx,DWORD [72+edi] - mov edx,DWORD [76+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [80+edi] - mov ebx,DWORD [84+edi] - mov ecx,DWORD 
[88+edi] - mov edx,DWORD [92+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [96+edi] - mov ebx,DWORD [100+edi] - mov ecx,DWORD [104+edi] - mov edx,DWORD [108+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - mov eax,DWORD [112+edi] - mov ebx,DWORD [116+edi] - mov ecx,DWORD [120+edi] - mov edx,DWORD [124+edi] - bswap eax - bswap ebx - bswap ecx - bswap edx - push eax - push ebx - push ecx - push edx - add edi,128 - sub esp,72 - mov DWORD [204+esp],edi - lea edi,[8+esp] - mov ecx,16 -dd 2784229001 -align 16 -L$00900_15_x86: - mov ecx,DWORD [40+esp] - mov edx,DWORD [44+esp] - mov esi,ecx - shr ecx,9 - mov edi,edx - shr edx,9 - mov ebx,ecx - shl esi,14 - mov eax,edx - shl edi,14 - xor ebx,esi - shr ecx,5 - xor eax,edi - shr edx,5 - xor eax,ecx - shl esi,4 - xor ebx,edx - shl edi,4 - xor ebx,esi - shr ecx,4 - xor eax,edi - shr edx,4 - xor eax,ecx - shl esi,5 - xor ebx,edx - shl edi,5 - xor eax,esi - xor ebx,edi - mov ecx,DWORD [48+esp] - mov edx,DWORD [52+esp] - mov esi,DWORD [56+esp] - mov edi,DWORD [60+esp] - add eax,DWORD [64+esp] - adc ebx,DWORD [68+esp] - xor ecx,esi - xor edx,edi - and ecx,DWORD [40+esp] - and edx,DWORD [44+esp] - add eax,DWORD [192+esp] - adc ebx,DWORD [196+esp] - xor ecx,esi - xor edx,edi - mov esi,DWORD [ebp] - mov edi,DWORD [4+ebp] - add eax,ecx - adc ebx,edx - mov ecx,DWORD [32+esp] - mov edx,DWORD [36+esp] - add eax,esi - adc ebx,edi - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - add eax,ecx - adc ebx,edx - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - mov DWORD [32+esp],eax - mov DWORD [36+esp],ebx - mov esi,ecx - shr ecx,2 - mov edi,edx - shr edx,2 - mov ebx,ecx - shl esi,4 - mov eax,edx - shl edi,4 - xor ebx,esi - shr ecx,5 - xor eax,edi - shr edx,5 - xor ebx,ecx - shl esi,21 - xor eax,edx - shl edi,21 - xor eax,esi - shr ecx,21 - xor ebx,edi - shr edx,21 - xor eax,ecx - shl esi,5 - xor ebx,edx - shl edi,5 - xor eax,esi - xor ebx,edi - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - mov esi,DWORD [16+esp] - mov edi,DWORD [20+esp] - add eax,DWORD [esp] - adc ebx,DWORD [4+esp] - or ecx,esi - or edx,edi - and ecx,DWORD [24+esp] - and edx,DWORD [28+esp] - and esi,DWORD [8+esp] - and edi,DWORD [12+esp] - or ecx,esi - or edx,edi - add eax,ecx - adc ebx,edx - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - mov dl,BYTE [ebp] - sub esp,8 - lea ebp,[8+ebp] - cmp dl,148 - jne NEAR L$00900_15_x86 -align 16 -L$01016_79_x86: - mov ecx,DWORD [312+esp] - mov edx,DWORD [316+esp] - mov esi,ecx - shr ecx,1 - mov edi,edx - shr edx,1 - mov eax,ecx - shl esi,24 - mov ebx,edx - shl edi,24 - xor ebx,esi - shr ecx,6 - xor eax,edi - shr edx,6 - xor eax,ecx - shl esi,7 - xor ebx,edx - shl edi,1 - xor ebx,esi - shr ecx,1 - xor eax,edi - shr edx,1 - xor eax,ecx - shl edi,6 - xor ebx,edx - xor eax,edi - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - mov ecx,DWORD [208+esp] - mov edx,DWORD [212+esp] - mov esi,ecx - shr ecx,6 - mov edi,edx - shr edx,6 - mov eax,ecx - shl esi,3 - mov ebx,edx - shl edi,3 - xor eax,esi - shr ecx,13 - xor ebx,edi - shr edx,13 - xor eax,ecx - shl esi,10 - xor ebx,edx - shl edi,10 - xor ebx,esi - shr ecx,10 - xor eax,edi - shr edx,10 - xor ebx,ecx - shl edi,13 - xor eax,edx - xor eax,edi - mov ecx,DWORD [320+esp] - mov edx,DWORD [324+esp] - add eax,DWORD [esp] - adc ebx,DWORD [4+esp] - mov esi,DWORD [248+esp] - mov edi,DWORD [252+esp] - add eax,ecx - adc ebx,edx - add eax,esi - adc ebx,edi - mov DWORD [192+esp],eax - mov DWORD [196+esp],ebx - mov 
ecx,DWORD [40+esp] - mov edx,DWORD [44+esp] - mov esi,ecx - shr ecx,9 - mov edi,edx - shr edx,9 - mov ebx,ecx - shl esi,14 - mov eax,edx - shl edi,14 - xor ebx,esi - shr ecx,5 - xor eax,edi - shr edx,5 - xor eax,ecx - shl esi,4 - xor ebx,edx - shl edi,4 - xor ebx,esi - shr ecx,4 - xor eax,edi - shr edx,4 - xor eax,ecx - shl esi,5 - xor ebx,edx - shl edi,5 - xor eax,esi - xor ebx,edi - mov ecx,DWORD [48+esp] - mov edx,DWORD [52+esp] - mov esi,DWORD [56+esp] - mov edi,DWORD [60+esp] - add eax,DWORD [64+esp] - adc ebx,DWORD [68+esp] - xor ecx,esi - xor edx,edi - and ecx,DWORD [40+esp] - and edx,DWORD [44+esp] - add eax,DWORD [192+esp] - adc ebx,DWORD [196+esp] - xor ecx,esi - xor edx,edi - mov esi,DWORD [ebp] - mov edi,DWORD [4+ebp] - add eax,ecx - adc ebx,edx - mov ecx,DWORD [32+esp] - mov edx,DWORD [36+esp] - add eax,esi - adc ebx,edi - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - add eax,ecx - adc ebx,edx - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - mov DWORD [32+esp],eax - mov DWORD [36+esp],ebx - mov esi,ecx - shr ecx,2 - mov edi,edx - shr edx,2 - mov ebx,ecx - shl esi,4 - mov eax,edx - shl edi,4 - xor ebx,esi - shr ecx,5 - xor eax,edi - shr edx,5 - xor ebx,ecx - shl esi,21 - xor eax,edx - shl edi,21 - xor eax,esi - shr ecx,21 - xor ebx,edi - shr edx,21 - xor eax,ecx - shl esi,5 - xor ebx,edx - shl edi,5 - xor eax,esi - xor ebx,edi - mov ecx,DWORD [8+esp] - mov edx,DWORD [12+esp] - mov esi,DWORD [16+esp] - mov edi,DWORD [20+esp] - add eax,DWORD [esp] - adc ebx,DWORD [4+esp] - or ecx,esi - or edx,edi - and ecx,DWORD [24+esp] - and edx,DWORD [28+esp] - and esi,DWORD [8+esp] - and edi,DWORD [12+esp] - or ecx,esi - or edx,edi - add eax,ecx - adc ebx,edx - mov DWORD [esp],eax - mov DWORD [4+esp],ebx - mov dl,BYTE [ebp] - sub esp,8 - lea ebp,[8+ebp] - cmp dl,23 - jne NEAR L$01016_79_x86 - mov esi,DWORD [840+esp] - mov edi,DWORD [844+esp] - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edx,DWORD [12+esi] - add eax,DWORD [8+esp] - adc ebx,DWORD [12+esp] - mov DWORD [esi],eax - mov DWORD [4+esi],ebx - add ecx,DWORD [16+esp] - adc edx,DWORD [20+esp] - mov DWORD [8+esi],ecx - mov DWORD [12+esi],edx - mov eax,DWORD [16+esi] - mov ebx,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov edx,DWORD [28+esi] - add eax,DWORD [24+esp] - adc ebx,DWORD [28+esp] - mov DWORD [16+esi],eax - mov DWORD [20+esi],ebx - add ecx,DWORD [32+esp] - adc edx,DWORD [36+esp] - mov DWORD [24+esi],ecx - mov DWORD [28+esi],edx - mov eax,DWORD [32+esi] - mov ebx,DWORD [36+esi] - mov ecx,DWORD [40+esi] - mov edx,DWORD [44+esi] - add eax,DWORD [40+esp] - adc ebx,DWORD [44+esp] - mov DWORD [32+esi],eax - mov DWORD [36+esi],ebx - add ecx,DWORD [48+esp] - adc edx,DWORD [52+esp] - mov DWORD [40+esi],ecx - mov DWORD [44+esi],edx - mov eax,DWORD [48+esi] - mov ebx,DWORD [52+esi] - mov ecx,DWORD [56+esi] - mov edx,DWORD [60+esi] - add eax,DWORD [56+esp] - adc ebx,DWORD [60+esp] - mov DWORD [48+esi],eax - mov DWORD [52+esi],ebx - add ecx,DWORD [64+esp] - adc edx,DWORD [68+esp] - mov DWORD [56+esi],ecx - mov DWORD [60+esi],edx - add esp,840 - sub ebp,640 - cmp edi,DWORD [8+esp] - jb NEAR L$002loop_x86 - mov esp,DWORD [12+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -align 64 -L$001K512: -dd 3609767458,1116352408 -dd 602891725,1899447441 -dd 3964484399,3049323471 -dd 2173295548,3921009573 -dd 4081628472,961987163 -dd 3053834265,1508970993 -dd 2937671579,2453635748 -dd 3664609560,2870763221 -dd 2734883394,3624381080 -dd 1164996542,310598401 -dd 1323610764,607225278 -dd 3590304994,1426881987 -dd 
4068182383,1925078388 -dd 991336113,2162078206 -dd 633803317,2614888103 -dd 3479774868,3248222580 -dd 2666613458,3835390401 -dd 944711139,4022224774 -dd 2341262773,264347078 -dd 2007800933,604807628 -dd 1495990901,770255983 -dd 1856431235,1249150122 -dd 3175218132,1555081692 -dd 2198950837,1996064986 -dd 3999719339,2554220882 -dd 766784016,2821834349 -dd 2566594879,2952996808 -dd 3203337956,3210313671 -dd 1034457026,3336571891 -dd 2466948901,3584528711 -dd 3758326383,113926993 -dd 168717936,338241895 -dd 1188179964,666307205 -dd 1546045734,773529912 -dd 1522805485,1294757372 -dd 2643833823,1396182291 -dd 2343527390,1695183700 -dd 1014477480,1986661051 -dd 1206759142,2177026350 -dd 344077627,2456956037 -dd 1290863460,2730485921 -dd 3158454273,2820302411 -dd 3505952657,3259730800 -dd 106217008,3345764771 -dd 3606008344,3516065817 -dd 1432725776,3600352804 -dd 1467031594,4094571909 -dd 851169720,275423344 -dd 3100823752,430227734 -dd 1363258195,506948616 -dd 3750685593,659060556 -dd 3785050280,883997877 -dd 3318307427,958139571 -dd 3812723403,1322822218 -dd 2003034995,1537002063 -dd 3602036899,1747873779 -dd 1575990012,1955562222 -dd 1125592928,2024104815 -dd 2716904306,2227730452 -dd 442776044,2361852424 -dd 593698344,2428436474 -dd 3733110249,2756734187 -dd 2999351573,3204031479 -dd 3815920427,3329325298 -dd 3928383900,3391569614 -dd 566280711,3515267271 -dd 3454069534,3940187606 -dd 4000239992,4118630271 -dd 1914138554,116418474 -dd 2731055270,174292421 -dd 3203993006,289380356 -dd 320620315,460393269 -dd 587496836,685471733 -dd 1086792851,852142971 -dd 365543100,1017036298 -dd 2618297676,1126000580 -dd 3409855158,1288033470 -dd 4234509866,1501505948 -dd 987167468,1607167915 -dd 1246189591,1816402316 -dd 67438087,66051 -dd 202182159,134810123 -db 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 -db 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -db 62,0 -segment .bss -common _OPENSSL_ia32cap_P 16 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-ios.ios.arm.S deleted file mode 100644 index c587ac01..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-ios.ios.arm.S +++ /dev/null @@ -1,1866 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. 
-@ -@ Permission to use under GPL terms is granted. -@ ==================================================================== - -@ SHA512 block procedure for ARMv4. September 2007. - -@ This code is ~4.5 (four and a half) times faster than code generated -@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue -@ Xscale PXA250 core]. -@ -@ July 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 6% improvement on -@ Cortex A8 core and ~40 cycles per processed byte. - -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 7% -@ improvement on Coxtex A8 core and ~38 cycles per byte. - -@ March 2011. -@ -@ Add NEON implementation. On Cortex A8 it was measured to process -@ one byte in 23.3 cycles or ~60% faster than integer-only code. - -@ August 2012. -@ -@ Improve NEON performance by 12% on Snapdragon S4. In absolute -@ terms it's 22.6 cycles per byte, which is disappointing result. -@ Technical writers asserted that 3-way S4 pipeline can sustain -@ multiple NEON instructions per cycle, but dual NEON issue could -@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html -@ for further details. On side note Cortex-A15 processes one byte in -@ 16 cycles. - -@ Byte order [in]dependence. ========================================= -@ -@ Originally caller was expected to maintain specific *dword* order in -@ h[0-7], namely with most significant dword at *lower* address, which -@ was reflected in below two parameters as 0 and 4. Now caller is -@ expected to maintain native byte order for whole 64-bit values. -#ifndef __KERNEL__ -# include -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -#else -# define __ARM_MAX_ARCH__ 7 -# define VFP_ABI_PUSH -# define VFP_ABI_POP -#endif - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
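The scalar path of this deleted ARMv4 file evaluates each 64-bit SHA-512 rotation on a pair of 32-bit LO/HI register halves, following the Sigma1 identity annotated further down in the file ("LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23"). A minimal C sketch of that split, assuming the usual decomposition of a 64-bit word into 32-bit lo/hi halves (the helper name sigma1_split is illustrative, not taken from the source):

#include <stdint.h>

/* Sigma1(x) = ROTR64(x,14) ^ ROTR64(x,18) ^ ROTR64(x,41), computed on
 * 32-bit halves. For n < 32, ROTR64 yields lo' = lo>>n | hi<<(32-n) and
 * hi' = hi>>n | lo<<(32-n); rotating by 41 first swaps the halves
 * (rotate by 32) and then rotates the result by 9. */
static void sigma1_split(uint32_t lo, uint32_t hi,
                         uint32_t *out_lo, uint32_t *out_hi) {
  *out_lo = (lo >> 14 | hi << 18) ^ (lo >> 18 | hi << 14) ^ (hi >> 9 | lo << 23);
  *out_hi = (hi >> 14 | lo << 18) ^ (hi >> 18 | lo << 14) ^ (lo >> 9 | hi << 23);
}

The same half-word pattern recurs for Sigma0, sigma0, and sigma1 in the integer loop below; the NEON path instead keeps whole 64-bit lanes in d/q registers.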
- - -#ifdef __ARMEL__ -# define LO 0 -# define HI 4 -# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 -#else -# define HI 0 -# define LO 4 -# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 -#endif - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -# define adrl adr -#else -.code 32 -#endif - - -.align 5 -K512: - WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) - WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) - WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) - WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) - WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) - WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) - WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) - WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) - WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) - WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) - WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) - WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) - WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) - WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) - WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) - WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) - WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) - WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) - WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) - WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) - WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) - WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) - WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) - WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) - WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) - WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) - WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) - WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) - WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) - WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) - WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) - WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) - WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) - WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) - WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) - WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) - WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) - WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) - WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) - WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) - - -.globl _sha512_block_data_order_nohw -.private_extern _sha512_block_data_order_nohw -#ifdef __thumb2__ -.thumb_func _sha512_block_data_order_nohw -#endif -_sha512_block_data_order_nohw: - add r2,r1,r2,lsl#7 @ len to point at the end of inp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - adr r14,K512 - sub sp,sp,#9*8 - - ldr r7,[r0,#32+LO] - ldr r8,[r0,#32+HI] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] -Loop: - str r9, [sp,#48+0] - str r10, [sp,#48+4] - str r11, [sp,#56+0] - str r12, [sp,#56+4] - ldr r5,[r0,#0+LO] - ldr r6,[r0,#0+HI] - ldr r3,[r0,#8+LO] - ldr r4,[r0,#8+HI] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - str r3,[sp,#8+0] - str r4,[sp,#8+4] - str r9, [sp,#16+0] - str r10, [sp,#16+4] - str r11, [sp,#24+0] - str r12, [sp,#24+4] - ldr r3,[r0,#40+LO] - ldr r4,[r0,#40+HI] - str r3,[sp,#40+0] - str r4,[sp,#40+4] - -L00_15: -#if __ARM_ARCH<7 - ldrb r3,[r1,#7] - ldrb r9, [r1,#6] - ldrb r10, [r1,#5] - 
ldrb r11, [r1,#4] - ldrb r4,[r1,#3] - ldrb r12, [r1,#2] - orr r3,r3,r9,lsl#8 - ldrb r9, [r1,#1] - orr r3,r3,r10,lsl#16 - ldrb r10, [r1],#8 - orr r3,r3,r11,lsl#24 - orr r4,r4,r12,lsl#8 - orr r4,r4,r9,lsl#16 - orr r4,r4,r10,lsl#24 -#else - ldr r3,[r1,#4] - ldr r4,[r1],#8 -#ifdef __ARMEL__ - rev r3,r3 - rev r4,r4 -#endif -#endif - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#148 - - ldr r12,[sp,#16+0] @ c.lo -#if __ARM_ARCH>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 - tst r14,#1 - beq L00_15 - ldr r9,[sp,#184+0] - ldr r10,[sp,#184+4] - bic r14,r14,#1 -L16_79: - @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - mov r3,r9,lsr#1 - ldr r11,[sp,#80+0] - mov r4,r10,lsr#1 - ldr r12,[sp,#80+4] - eor r3,r3,r10,lsl#31 - eor r4,r4,r9,lsl#31 - eor r3,r3,r9,lsr#8 - eor r4,r4,r10,lsr#8 - eor r3,r3,r10,lsl#24 - eor r4,r4,r9,lsl#24 - eor r3,r3,r9,lsr#7 - eor r4,r4,r10,lsr#7 - eor r3,r3,r10,lsl#25 - - @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - mov r9,r11,lsr#19 - mov r10,r12,lsr#19 - eor r9,r9,r12,lsl#13 - eor r10,r10,r11,lsl#13 - eor r9,r9,r12,lsr#29 - eor r10,r10,r11,lsr#29 - eor r9,r9,r11,lsl#3 - eor r10,r10,r12,lsl#3 - eor r9,r9,r11,lsr#6 - eor r10,r10,r12,lsr#6 - ldr r11,[sp,#120+0] - eor r9,r9,r12,lsl#26 - - ldr r12,[sp,#120+4] - adds r3,r3,r9 - ldr r9,[sp,#192+0] - adc r4,r4,r10 - - ldr r10,[sp,#192+4] - adds r3,r3,r11 - adc r4,r4,r12 
- adds r3,r3,r9 - adc r4,r4,r10 - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#23 - - ldr r12,[sp,#16+0] @ c.lo -#if __ARM_ARCH>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 -#if __ARM_ARCH>=7 - ittt eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq r9,[sp,#184+0] - ldreq r10,[sp,#184+4] - beq L16_79 - bic r14,r14,#1 - - ldr r3,[sp,#8+0] - ldr r4,[sp,#8+4] - ldr r9, [r0,#0+LO] - ldr r10, [r0,#0+HI] - ldr r11, [r0,#8+LO] - ldr r12, [r0,#8+HI] - adds r9,r5,r9 - str r9, [r0,#0+LO] - adc r10,r6,r10 - str r10, [r0,#0+HI] - adds r11,r3,r11 - str r11, [r0,#8+LO] - adc r12,r4,r12 - str r12, [r0,#8+HI] - - ldr r5,[sp,#16+0] - ldr r6,[sp,#16+4] - ldr r3,[sp,#24+0] - ldr r4,[sp,#24+4] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - adds r9,r5,r9 - str r9, [r0,#16+LO] - adc r10,r6,r10 - str r10, [r0,#16+HI] - adds r11,r3,r11 - str r11, [r0,#24+LO] - adc r12,r4,r12 - str r12, [r0,#24+HI] - - ldr r3,[sp,#40+0] - ldr r4,[sp,#40+4] - ldr r9, [r0,#32+LO] - ldr r10, [r0,#32+HI] - ldr r11, [r0,#40+LO] - ldr r12, [r0,#40+HI] - adds r7,r7,r9 - str r7,[r0,#32+LO] - adc r8,r8,r10 - str r8,[r0,#32+HI] - adds r11,r3,r11 - str r11, [r0,#40+LO] - adc r12,r4,r12 - str r12, [r0,#40+HI] - - ldr r5,[sp,#48+0] - ldr r6,[sp,#48+4] - ldr r3,[sp,#56+0] - ldr r4,[sp,#56+4] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] - adds r9,r5,r9 - str r9, [r0,#48+LO] - adc r10,r6,r10 - str r10, [r0,#48+HI] - adds r11,r3,r11 - str r11, [r0,#56+LO] - adc r12,r4,r12 - str r12, 
[r0,#56+HI] - - add sp,sp,#640 - sub r14,r14,#640 - - teq r1,r2 - bne Loop - - add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.globl _sha512_block_data_order_neon -.private_extern _sha512_block_data_order_neon -#ifdef __thumb2__ -.thumb_func _sha512_block_data_order_neon -#endif -.align 4 -_sha512_block_data_order_neon: - dmb @ errata #451034 on early Cortex A8 - add r2,r1,r2,lsl#7 @ len to point at the end of inp - adr r3,K512 - VFP_ABI_PUSH - vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context -Loop_neon: - vshr.u64 d24,d20,#14 @ 0 -#if 0<16 - vld1.64 {d0},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 -#if 0>0 - vadd.i64 d16,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 0<16 && defined(__ARMEL__) - vrev64.8 d0,d0 -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d0 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 1 -#if 1<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 1>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 1<16 && defined(__ARMEL__) - vrev64.8 d1,d1 -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d1 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 2 -#if 2<16 - vld1.64 {d2},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 -#if 2>0 - vadd.i64 d22,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 2<16 && defined(__ARMEL__) - vrev64.8 d2,d2 -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d2 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 3 -#if 3<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 3>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 3<16 && defined(__ARMEL__) - vrev64.8 d3,d3 -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d3 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 4 -#if 4<16 - vld1.64 {d4},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 -#if 4>0 - vadd.i64 d20,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 4<16 && defined(__ARMEL__) - vrev64.8 d4,d4 -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d4 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 5 -#if 5<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 5>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 5<16 && defined(__ARMEL__) - vrev64.8 d5,d5 -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d5 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 6 -#if 6<16 - vld1.64 {d6},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 -#if 6>0 - vadd.i64 d18,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 6<16 && defined(__ARMEL__) - vrev64.8 d6,d6 -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d6 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 7 -#if 7<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 7>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 7<16 && defined(__ARMEL__) - vrev64.8 d7,d7 -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d7 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - vshr.u64 d24,d20,#14 @ 8 -#if 8<16 - vld1.64 {d8},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 -#if 8>0 - vadd.i64 d16,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 8<16 && defined(__ARMEL__) - vrev64.8 d8,d8 -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d8 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 9 -#if 9<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 9>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 9<16 && defined(__ARMEL__) - vrev64.8 d9,d9 -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d9 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 10 -#if 10<16 - vld1.64 {d10},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 -#if 10>0 - vadd.i64 d22,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 10<16 && defined(__ARMEL__) - vrev64.8 d10,d10 -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d10 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 11 -#if 11<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 11>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 11<16 && defined(__ARMEL__) - vrev64.8 d11,d11 -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d11 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 12 -#if 12<16 - vld1.64 {d12},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 -#if 12>0 - vadd.i64 d20,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 12<16 && defined(__ARMEL__) - vrev64.8 d12,d12 -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d12 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 13 -#if 13<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 13>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 13<16 && defined(__ARMEL__) - vrev64.8 d13,d13 -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d13 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 14 -#if 14<16 - vld1.64 {d14},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 -#if 14>0 - vadd.i64 d18,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 14<16 && defined(__ARMEL__) - vrev64.8 d14,d14 -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d14 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 15 -#if 15<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 15>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 15<16 && defined(__ARMEL__) - vrev64.8 d15,d15 -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d15 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - mov r12,#4 -L16_79_neon: - subs r12,#1 - vshr.u64 q12,q7,#19 - vshr.u64 q13,q7,#61 - vadd.i64 d16,d30 @ h+=Maj from the past - vshr.u64 q15,q7,#6 - vsli.64 q12,q7,#45 - vext.8 q14,q0,q1,#8 @ X[i+1] - vsli.64 q13,q7,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q0,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q4,q5,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q0,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q0,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 16<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d0 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 17 -#if 17<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 17>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 17<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d1 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 q12,q0,#19 - vshr.u64 q13,q0,#61 - vadd.i64 d22,d30 @ h+=Maj from the past - vshr.u64 q15,q0,#6 - vsli.64 q12,q0,#45 - vext.8 q14,q1,q2,#8 @ X[i+1] - vsli.64 q13,q0,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q1,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q5,q6,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 q1,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q1,q15 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 18<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d2 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 19 -#if 19<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 19>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 19<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d3 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 q12,q1,#19 - vshr.u64 q13,q1,#61 - vadd.i64 d20,d30 @ h+=Maj from the past - vshr.u64 q15,q1,#6 - vsli.64 q12,q1,#45 - vext.8 q14,q2,q3,#8 @ X[i+1] - vsli.64 q13,q1,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q2,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q6,q7,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from NEON_00_15 - vadd.i64 q2,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q2,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 20<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d4 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 21 -#if 21<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 21>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 21<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d5 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 q12,q2,#19 - vshr.u64 q13,q2,#61 - vadd.i64 d18,d30 @ h+=Maj from the past - vshr.u64 q15,q2,#6 - vsli.64 q12,q2,#45 - vext.8 q14,q3,q4,#8 @ X[i+1] - vsli.64 q13,q2,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q3,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q7,q0,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d22,#14 @ from NEON_00_15 - vadd.i64 q3,q14 - vshr.u64 d25,d22,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q3,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 22<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d6 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 23 -#if 23<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 23>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 23<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d7 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - vshr.u64 q12,q3,#19 - vshr.u64 q13,q3,#61 - vadd.i64 d16,d30 @ h+=Maj from the past - vshr.u64 q15,q3,#6 - vsli.64 q12,q3,#45 - vext.8 q14,q4,q5,#8 @ X[i+1] - vsli.64 q13,q3,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q4,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q0,q1,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q4,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q4,q15 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 24<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d8 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 25 -#if 25<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 25>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 25<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d9 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 q12,q4,#19 - vshr.u64 q13,q4,#61 - vadd.i64 d22,d30 @ h+=Maj from the past - vshr.u64 q15,q4,#6 - vsli.64 q12,q4,#45 - vext.8 q14,q5,q6,#8 @ X[i+1] - vsli.64 q13,q4,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q5,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q1,q2,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 q5,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q5,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 26<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d10 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 27 -#if 27<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 27>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 27<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d11 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 q12,q5,#19 - vshr.u64 q13,q5,#61 - vadd.i64 d20,d30 @ h+=Maj from the past - vshr.u64 q15,q5,#6 - vsli.64 q12,q5,#45 - vext.8 q14,q6,q7,#8 @ X[i+1] - vsli.64 q13,q5,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q6,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q2,q3,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from NEON_00_15 - vadd.i64 q6,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q6,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 28<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d12 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 29 -#if 29<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 29>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 29<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d13 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 q12,q6,#19 - vshr.u64 q13,q6,#61 - vadd.i64 d18,d30 @ h+=Maj from the past - vshr.u64 q15,q6,#6 - vsli.64 q12,q6,#45 - vext.8 q14,q7,q0,#8 @ X[i+1] - vsli.64 q13,q6,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q7,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q3,q4,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d22,#14 @ from NEON_00_15 - vadd.i64 q7,q14 - vshr.u64 d25,d22,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q7,q15 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 30<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d14 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 31 -#if 31<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 31>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 31<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d15 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - bne L16_79_neon - - vadd.i64 d16,d30 @ h+=Maj from the past - vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp - vadd.i64 q8,q12 @ vectorized accumulate - vadd.i64 q9,q13 - vadd.i64 q10,q14 - vadd.i64 q11,q15 - vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context - teq r1,r2 - sub r3,#640 @ rewind K512 - bne Loop_neon - - VFP_ABI_POP - bx lr @ .word 0xe12fff1e - -#endif -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c b/Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c.inc similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c rename to Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c.inc index 67c2ec7b..fb8b5473 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c +++ b/Sources/CCryptoBoringSSL/crypto/fipsmodule/tls/kdf.c.inc @@ -189,6 +189,7 @@ int CRYPTO_tls13_hkdf_expand_label(uint8_t *out, size_t out_len, uint8_t *hkdf_label = NULL; size_t hkdf_label_len; + FIPS_service_indicator_lock_state(); CBB_zero(&cbb); if (!CBB_init(&cbb, 2 + 1 + sizeof(kProtocolLabel) - 1 + label_len + 1 + hash_len) || @@ -200,12 +201,18 @@ int CRYPTO_tls13_hkdf_expand_label(uint8_t *out, size_t out_len, !CBB_add_bytes(&child, hash, hash_len) || !CBB_finish(&cbb, &hkdf_label, &hkdf_label_len)) { CBB_cleanup(&cbb); + FIPS_service_indicator_unlock_state(); return 0; } const int ret = HKDF_expand(out, out_len, digest, secret, secret_len, hkdf_label, hkdf_label_len); OPENSSL_free(hkdf_label); + + FIPS_service_indicator_unlock_state(); + if (ret) { + TLSKDF_verify_service_indicator(digest); + } return ret; } diff --git 
a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-ios.ios.arm.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-ios.ios.arm.S deleted file mode 100644 index d0f196ed..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-ios.ios.arm.S +++ /dev/null @@ -1,1264 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__APPLE__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__) -.syntax unified - - - - -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -.text - - -.align 7 @ totally strategic alignment -_vpaes_consts: -Lk_mc_forward:@ mc_forward -.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -.quad 0x080B0A0904070605, 0x000302010C0F0E0D -.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -.quad 0x000302010C0F0E0D, 0x080B0A0904070605 -Lk_mc_backward:@ mc_backward -.quad 0x0605040702010003, 0x0E0D0C0F0A09080B -.quad 0x020100030E0D0C0F, 0x0A09080B06050407 -.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -.quad 0x0A09080B06050407, 0x020100030E0D0C0F -Lk_sr:@ sr -.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -.quad 0x030E09040F0A0500, 0x0B06010C07020D08 -.quad 0x0F060D040B020900, 0x070E050C030A0108 -.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -@ -@ "Hot" constants -@ -Lk_inv:@ inv, inva -.quad 0x0E05060F0D080180, 0x040703090A0B0C02 -.quad 0x01040A060F0B0780, 0x030D0E0C02050809 -Lk_ipt:@ input transform (lo, hi) -.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 -Lk_sbo:@ sbou, sbot -.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA -Lk_sb1:@ sb1u, sb1t -.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -Lk_sb2:@ sb2u, sb2t -.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD - -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 -.align 2 - -.align 6 -@@ -@@ _aes_preheat -@@ -@@ Fills q9-q15 as specified below. -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_preheat -#endif -.align 4 -_vpaes_preheat: - adr r10, Lk_inv - vmov.i8 q9, #0x0f @ Lk_s0F - vld1.64 {q10,q11}, [r10]! @ Lk_inv - add r10, r10, #64 @ Skip Lk_ipt, Lk_sbo - vld1.64 {q12,q13}, [r10]! @ Lk_sb1 - vld1.64 {q14,q15}, [r10] @ Lk_sb2 - bx lr - -@@ -@@ _aes_encrypt_core -@@ -@@ AES-encrypt q0. -@@ -@@ Inputs: -@@ q0 = input -@@ q9-q15 as in _vpaes_preheat -@@ [r2] = scheduled keys -@@ -@@ Output in q0 -@@ Clobbers q1-q5, r8-r11 -@@ Preserves q6-q8 so you get some local vectors -@@ -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt_core -#endif -.align 4 -_vpaes_encrypt_core: - mov r9, r2 - ldr r8, [r2,#240] @ pull rounds - adr r11, Lk_ipt - @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi - vld1.64 {q2, q3}, [r11] - adr r11, Lk_mc_forward+16 - vld1.64 {q5}, [r9]! 
@ vmovdqu (%r9), %xmm5 # round0 key - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 - vtbl.8 d3, {q2}, d3 - vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 - vtbl.8 d5, {q3}, d1 - veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - - @ .Lenc_entry ends with a bnz instruction which is normally paired with - @ subs in .Lenc_loop. - tst r8, r8 - b Lenc_entry - -.align 4 -Lenc_loop: - @ middle of middle round - add r10, r11, #0x40 - vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - vtbl.8 d9, {q13}, d5 - vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] - vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t - vtbl.8 d1, {q12}, d7 - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u - vtbl.8 d11, {q15}, d5 - veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A - vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - vtbl.8 d5, {q14}, d7 - vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] - vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B - vtbl.8 d7, {q0}, d3 - veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A - @ Write to q5 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D - vtbl.8 d11, {q0}, d9 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B - vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C - vtbl.8 d9, {q3}, d3 - @ Here we restore the original q0/q5 usage. - veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D - and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 - veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D - subs r8, r8, #1 @ nr-- - -Lenc_entry: - @ top of round - vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k - vtbl.8 d11, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - vtbl.8 d5, {q10}, d7 - vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - vtbl.8 d7, {q10}, d9 - veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io - veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 - bne Lenc_loop - - @ middle of last round - add r10, r11, #0x80 - - adr r11, Lk_sbo - @ Read to q1 instead of q4, so the vtbl.8 instruction below does not - @ overlap table and destination registers. - vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou - vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot Lk_sbo+16 - vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - vtbl.8 d9, {q1}, d5 - vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] - @ Write to q2 instead of q0 below, to avoid overlapping table and - @ destination registers. 
- vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t - vtbl.8 d5, {q0}, d7 - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A - @ Here we restore the original q0/q2 usage. - vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 - vtbl.8 d1, {q2}, d3 - bx lr - - -.globl _vpaes_encrypt -.private_extern _vpaes_encrypt -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt -#endif -.align 4 -_vpaes_encrypt: - @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack - @ alignment. - stmdb sp!, {r7,r8,r9,r10,r11,lr} - @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11} - - vld1.64 {q0}, [r0] - bl _vpaes_preheat - bl _vpaes_encrypt_core - vst1.64 {q0}, [r1] - - vldmia sp!, {d8,d9,d10,d11} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -@ -@ Decryption stuff -@ - -.align 4 -_vpaes_decrypt_consts: -Lk_dipt:@ decryption input transform -.quad 0x0F505B040B545F00, 0x154A411E114E451A -.quad 0x86E383E660056500, 0x12771772F491F194 -Lk_dsbo:@ decryption sbox final output -.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D -.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -Lk_dsb9:@ decryption sbox output *9*u, *9*t -.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 -.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -Lk_dsbd:@ decryption sbox output *D*u, *D*t -.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 -.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -Lk_dsbb:@ decryption sbox output *B*u, *B*t -.quad 0xD022649296B44200, 0x602646F6B0F2D404 -.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -Lk_dsbe:@ decryption sbox output *E*u, *E*t -.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 -.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 - - -@@ -@@ Decryption core -@@ -@@ Same API as encryption core, except it clobbers q12-q15 rather than using -@@ the values from _vpaes_preheat. q9-q11 must still be set from -@@ _vpaes_preheat. -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt_core -#endif -.align 4 -_vpaes_decrypt_core: - mov r9, r2 - ldr r8, [r2,#240] @ pull rounds - - @ This function performs shuffles with various constants. The x86_64 - @ version loads them on-demand into %xmm0-%xmm5. This does not work well - @ for ARMv7 because those registers are shuffle destinations. The ARMv8 - @ version preloads those constants into registers, but ARMv7 has half - @ the registers to work with. Instead, we load them on-demand into - @ q12-q15, registers normally use for preloaded constants. This is fine - @ because decryption doesn't use those constants. The values are - @ constant, so this does not interfere with potential 2x optimizations. - adr r7, Lk_dipt - - vld1.64 {q12,q13}, [r7] @ vmovdqa Lk_dipt(%rip), %xmm2 # iptlo - lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 - eor r11, r11, #0x30 @ xor $0x30, %r11 - adr r10, Lk_sr - and r11, r11, #0x30 @ and $0x30, %r11 - add r11, r11, r10 - adr r10, Lk_mc_forward+48 - - vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 - vtbl.8 d5, {q12}, d3 - vld1.64 {q5}, [r10] @ vmovdqa Lk_mc_forward+48(%rip), %xmm5 - @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi - vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 - vtbl.8 d1, {q13}, d1 - veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - - @ .Ldec_entry ends with a bnz instruction which is normally paired with - @ subs in .Ldec_loop. 
- tst r8, r8 - b Ldec_entry - -.align 4 -Ldec_loop: -@ -@ Inverse mix columns -@ - - @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of - @ the function. - adr r10, Lk_dsb9 - vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u - @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t - @ Load sbd* ahead of time. - vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu - @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt - vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u - vtbl.8 d9, {q12}, d5 - vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t - vtbl.8 d3, {q13}, d7 - veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 - - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - - @ Load sbb* ahead of time. - vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu - @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt - - vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu - vtbl.8 d9, {q14}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. - veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt - vtbl.8 d3, {q15}, d7 - @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt - - @ Load sbd* ahead of time. - vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu - @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet - - vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu - vtbl.8 d9, {q12}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. - veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt - vtbl.8 d3, {q13}, d7 - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - - vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu - vtbl.8 d9, {q14}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. 
- veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet - vtbl.8 d3, {q15}, d7 - vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - subs r8, r8, #1 @ sub $1,%rax # nr-- - -Ldec_entry: - @ top of round - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - vtbl.8 d5, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - vtbl.8 d5, {q10}, d7 - vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - vtbl.8 d7, {q10}, d9 - veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io - veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0 - bne Ldec_loop - - @ middle of last round - - adr r10, Lk_dsbo - - @ Write to q1 rather than q4 to avoid overlapping table and destination. - vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou - vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - vtbl.8 d9, {q1}, d5 - @ Write to q2 rather than q1 to avoid overlapping table and destination. - vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t - vtbl.8 d3, {q2}, d7 - vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 - veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k - @ Write to q1 rather than q0 so the table and destination registers - @ below do not overlap. - veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A - vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 - vtbl.8 d1, {q1}, d5 - bx lr - - -.globl _vpaes_decrypt -.private_extern _vpaes_decrypt -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt -#endif -.align 4 -_vpaes_decrypt: - @ _vpaes_decrypt_core uses r7-r11. - stmdb sp!, {r7,r8,r9,r10,r11,lr} - @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11} - - vld1.64 {q0}, [r0] - bl _vpaes_preheat - bl _vpaes_decrypt_core - vst1.64 {q0}, [r1] - - vldmia sp!, {d8,d9,d10,d11} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -@@ @@ -@@ AES key schedule @@ -@@ @@ -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - -@ This function diverges from both x86_64 and armv7 in which constants are -@ pinned. x86_64 has a common preheat function for all operations. aarch64 -@ separates them because it has enough registers to pin nearly all constants. -@ armv7 does not have enough registers, but needing explicit loads and stores -@ also complicates using x86_64's register allocation directly. -@ -@ We pin some constants for convenience and leave q14 and q15 free to load -@ others on demand. 
- -@ -@ Key schedule constants -@ - -.align 4 -_vpaes_key_consts: -Lk_dksd:@ decryption key schedule: invskew x*D -.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -Lk_dksb:@ decryption key schedule: invskew x*B -.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 -.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -Lk_dks9:@ decryption key schedule: invskew x*9 -.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - -Lk_rcon:@ rcon -.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -Lk_opt:@ output transform -.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 -Lk_deskew:@ deskew tables: inverts the sbox's "skew" -.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 - - -#ifdef __thumb2__ -.thumb_func _vpaes_key_preheat -#endif -.align 4 -_vpaes_key_preheat: - adr r11, Lk_rcon - vmov.i8 q12, #0x5b @ Lk_s63 - adr r10, Lk_inv @ Must be aligned to 8 mod 16. - vmov.i8 q9, #0x0f @ Lk_s0F - vld1.64 {q10,q11}, [r10] @ Lk_inv - vld1.64 {q8}, [r11] @ Lk_rcon - bx lr - - -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_core -#endif -.align 4 -_vpaes_schedule_core: - @ We only need to save lr, but ARM requires an 8-byte stack alignment, - @ so save an extra register. - stmdb sp!, {r3,lr} - - bl _vpaes_key_preheat @ load the tables - - adr r11, Lk_ipt @ Must be aligned to 8 mod 16. - vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) - - @ input transform - @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not - @ overlap table and destination. - vmov q4, q0 @ vmovdqa %xmm0, %xmm3 - bl _vpaes_schedule_transform - adr r10, Lk_sr @ Must be aligned to 8 mod 16. - vmov q7, q0 @ vmovdqa %xmm0, %xmm7 - - add r8, r8, r10 - tst r3, r3 - bne Lschedule_am_decrypting - - @ encrypting, output zeroth round key after transform - vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) - b Lschedule_go - -Lschedule_am_decrypting: - @ decrypting, output zeroth round key after shiftrows - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q4}, d3 - vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) - eor r8, r8, #0x30 @ xor $0x30, %r8 - -Lschedule_go: - cmp r1, #192 @ cmp $192, %esi - bhi Lschedule_256 - beq Lschedule_192 - @ 128: fall though - -@@ -@@ .schedule_128 -@@ -@@ 128-bit specific part of key schedule. -@@ -@@ This schedule is really simple, because all its parts -@@ are accomplished by the subroutines. -@@ -Lschedule_128: - mov r0, #10 @ mov $10, %esi - -Loop_schedule_128: - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle @ write output - b Loop_schedule_128 - -@@ -@@ .aes_schedule_192 -@@ -@@ 192-bit specific part of key schedule. -@@ -@@ The main body of this schedule is the same as the 128-bit -@@ schedule, but with more smearing. The long, high side is -@@ stored in q7 as before, and the short, low side is in -@@ the high bits of q6. -@@ -@@ This schedule is somewhat nastier, however, because each -@@ round produces 192 bits of key material, or 1.5 round keys. -@@ Therefore, on each cycle we do 2 rounds and produce 3 round -@@ keys. 
-@@ -.align 4 -Lschedule_192: - sub r0, r0, #8 - vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) - bl _vpaes_schedule_transform @ input transform - vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part - vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 - @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros - mov r0, #4 @ mov $4, %esi - -Loop_schedule_192: - bl _vpaes_schedule_round - vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 - bl _vpaes_schedule_mangle @ save key n - bl _vpaes_schedule_192_smear - bl _vpaes_schedule_mangle @ save key n+1 - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle @ save key n+2 - bl _vpaes_schedule_192_smear - b Loop_schedule_192 - -@@ -@@ .aes_schedule_256 -@@ -@@ 256-bit specific part of key schedule. -@@ -@@ The structure here is very similar to the 128-bit -@@ schedule, but with an additional "low side" in -@@ q6. The low side's rounds are the same as the -@@ high side's, except no rcon and no rotation. -@@ -.align 4 -Lschedule_256: - vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) - bl _vpaes_schedule_transform @ input transform - mov r0, #7 @ mov $7, %esi - -Loop_schedule_256: - bl _vpaes_schedule_mangle @ output low result - vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 - - @ high round - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle - - @ low round. swap xmm7 and xmm6 - vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 - vmov.i8 q4, #0 - vmov q5, q7 @ vmovdqa %xmm7, %xmm5 - vmov q7, q6 @ vmovdqa %xmm6, %xmm7 - bl _vpaes_schedule_low_round - vmov q7, q5 @ vmovdqa %xmm5, %xmm7 - - b Loop_schedule_256 - -@@ -@@ .aes_schedule_mangle_last -@@ -@@ Mangler for last round of key schedule -@@ Mangles q0 -@@ when encrypting, outputs out(q0) ^ 63 -@@ when decrypting, outputs unskew(q0) -@@ -@@ Always called right before return... jumps to cleanup and exits -@@ -.align 4 -Lschedule_mangle_last: - @ schedule last round key from xmm0 - adr r11, Lk_deskew @ lea Lk_deskew(%rip),%r11 # prepare to deskew - tst r3, r3 - bne Lschedule_mangle_last_dec - - @ encrypting - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 - adr r11, Lk_opt @ lea Lk_opt(%rip), %r11 # prepare to output transform - add r2, r2, #32 @ add $32, %rdx - vmov q2, q0 - vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute - vtbl.8 d1, {q2}, d3 - -Lschedule_mangle_last_dec: - sub r2, r2, #16 @ add $-16, %rdx - veor q0, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm0 - bl _vpaes_schedule_transform @ output transform - vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key - - @ cleanup - veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 - veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 - veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 - veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 - veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 - veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 - veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 - veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 - ldmia sp!, {r3,pc} @ return - - -@@ -@@ .aes_schedule_192_smear -@@ -@@ Smear the short, low side in the 192-bit key schedule. 
-@@ -@@ Inputs: -@@ q7: high side, b a x y -@@ q6: low side, d c 0 0 -@@ -@@ Outputs: -@@ q6: b+c+d b+c 0 0 -@@ q0: b+c+d b+c b a -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_192_smear -#endif -.align 4 -_vpaes_schedule_192_smear: - vmov.i8 q1, #0 - vdup.32 q0, d15[1] - vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 - vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a - veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 - veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 - veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a - vmov q0, q6 @ vmovdqa %xmm6, %xmm0 - vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros - bx lr - - -@@ -@@ .aes_schedule_round -@@ -@@ Runs one main round of the key schedule on q0, q7 -@@ -@@ Specifically, runs subbytes on the high dword of q0 -@@ then rotates it by one byte and xors into the low dword of -@@ q7. -@@ -@@ Adds rcon from low byte of q8, then rotates q8 for -@@ next rcon. -@@ -@@ Smears the dwords of q7 by xoring the low into the -@@ second low, result into third, result into highest. -@@ -@@ Returns results in q7 = q0. -@@ Clobbers q1-q4, r11. -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_round -#endif -.align 4 -_vpaes_schedule_round: - @ extract rcon from xmm8 - vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 - vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 - vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 - veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 - - @ rotate - vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 - vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 - - @ fall through... - - @ low round: same as high round, but no rotation and no rcon. -_vpaes_schedule_low_round: - @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. - @ We pin other values in _vpaes_key_preheat, so load them now. 
- adr r11, Lk_sb1 - vld1.64 {q14,q15}, [r11] - - @ smear xmm7 - vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 - veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 - vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 - - @ subbytes - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 - vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - vtbl.8 d5, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q7, q7, q12 @ vpxor Lk_s63(%rip), %xmm7, %xmm7 - vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak - vtbl.8 d7, {q10}, d7 - veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak - vtbl.8 d5, {q10}, d9 - veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io - veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo - vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou - vtbl.8 d9, {q15}, d7 - vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t - vtbl.8 d3, {q14}, d5 - veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output - - @ add in smeared stuff - veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 - veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 - bx lr - - -@@ -@@ .aes_schedule_transform -@@ -@@ Linear-transform q0 according to tables at [r11] -@@ -@@ Requires that q9 = 0x0F0F... as in preheat -@@ Output in q0 -@@ Clobbers q1, q2, q14, q15 -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_transform -#endif -.align 4 -_vpaes_schedule_transform: - vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo - @ vmovdqa 16(%r11), %xmm1 # hi - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d3 - vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 - vtbl.8 d1, {q15}, d1 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - bx lr - - -@@ -@@ .aes_schedule_mangle -@@ -@@ Mangles q0 from (basis-transformed) standard version -@@ to our version. -@@ -@@ On encrypt, -@@ xor with 0x63 -@@ multiply by circulant 0,1,1,1 -@@ apply shiftrows transform -@@ -@@ On decrypt, -@@ xor with 0x63 -@@ multiply by "inverse mixcolumns" circulant E,B,D,9 -@@ deskew -@@ apply shiftrows transform -@@ -@@ -@@ Writes out to [r2], and increments or decrements it -@@ Keeps track of round number mod 4 in r8 -@@ Preserves q0 -@@ Clobbers q1-q5 -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_mangle -#endif -.align 4 -_vpaes_schedule_mangle: - tst r3, r3 - vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later - adr r11, Lk_mc_forward @ Must be aligned to 8 mod 16. - vld1.64 {q5}, [r11] @ vmovdqa Lk_mc_forward(%rip),%xmm5 - bne Lschedule_mangle_dec - - @ encrypting - @ Write to q2 so we do not overlap table and destination below. 
- veor q2, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm4 - add r2, r2, #16 @ add $16, %rdx - vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 - vtbl.8 d9, {q2}, d11 - vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 - vtbl.8 d3, {q4}, d11 - vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 - vtbl.8 d7, {q1}, d11 - veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 - - b Lschedule_mangle_both -.align 4 -Lschedule_mangle_dec: - @ inverse mix columns - adr r11, Lk_dksd @ lea Lk_dksd(%rip),%r11 - vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi - vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo - - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 - @ vmovdqa 0x10(%r11), %xmm3 - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dksb ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 - @ vmovdqa 0x30(%r11), %xmm3 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dkse ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 - @ vmovdqa 0x50(%r11), %xmm3 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dkse ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 - @ vmovdqa 0x70(%r11), %xmm4 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 - vtbl.8 d9, {q15}, d3 - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 - - sub r2, r2, #16 @ add $-16, %rdx - -Lschedule_mangle_both: - @ Write to q2 so table and destination do not overlap. 
- vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d5, {q3}, d3 - add r8, r8, #64-16 @ add $-16, %r8 - and r8, r8, #~(1<<6) @ and $0x30, %r8 - vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) - bx lr - - -.globl _vpaes_set_encrypt_key -.private_extern _vpaes_set_encrypt_key -#ifdef __thumb2__ -.thumb_func _vpaes_set_encrypt_key -#endif -.align 4 -_vpaes_set_encrypt_key: - stmdb sp!, {r7,r8,r9,r10,r11, lr} - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - lsr r9, r1, #5 @ shr $5,%eax - add r9, r9, #5 @ $5,%eax - str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - - mov r3, #0 @ mov $0,%ecx - mov r8, #0x30 @ mov $0x30,%r8d - bl _vpaes_schedule_core - eor r0, r0, r0 - - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -.globl _vpaes_set_decrypt_key -.private_extern _vpaes_set_decrypt_key -#ifdef __thumb2__ -.thumb_func _vpaes_set_decrypt_key -#endif -.align 4 -_vpaes_set_decrypt_key: - stmdb sp!, {r7,r8,r9,r10,r11, lr} - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - lsr r9, r1, #5 @ shr $5,%eax - add r9, r9, #5 @ $5,%eax - str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - lsl r9, r9, #4 @ shl $4,%eax - add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx - add r2, r2, r9 - - mov r3, #1 @ mov $1,%ecx - lsr r8, r1, #1 @ shr $1,%r8d - and r8, r8, #32 @ and $32,%r8d - eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 - bl _vpaes_schedule_core - - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -@ Additional constants for converting to bsaes. - -.align 4 -_vpaes_convert_consts: -@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear -@ transform in the AES S-box. 0x63 is incorporated into the low half of the -@ table. This was computed with the following script: -@ -@ def u64s_to_u128(x, y): -@ return x | (y << 64) -@ def u128_to_u64s(w): -@ return w & ((1<<64)-1), w >> 64 -@ def get_byte(w, i): -@ return (w >> (i*8)) & 0xff -@ def apply_table(table, b): -@ lo = b & 0xf -@ hi = b >> 4 -@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) -@ def opt(b): -@ table = [ -@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), -@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), -@ ] -@ return apply_table(table, b) -@ def rot_byte(b, n): -@ return 0xff & ((b << n) | (b >> (8-n))) -@ def skew(x): -@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ -@ rot_byte(x, 4)) -@ table = [0, 0] -@ for i in range(16): -@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) -@ table[1] |= skew(opt(i<<4)) << (i*8) -@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) -@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) -Lk_opt_then_skew: -.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b -.quad 0x1f30062936192f00, 0xb49bad829db284ab - -@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation -@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 -@ becomes 0x22334411 and then 0x11443322. -Lk_decrypt_transform: -.quad 0x0704050603000102, 0x0f0c0d0e0b08090a - - -@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); -.globl _vpaes_encrypt_key_to_bsaes -.private_extern _vpaes_encrypt_key_to_bsaes -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt_key_to_bsaes -#endif -.align 4 -_vpaes_encrypt_key_to_bsaes: - stmdb sp!, {r11, lr} - - @ See _vpaes_schedule_core for the key schedule logic. 
In particular, - @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), - @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last - @ contain the transformations not in the bsaes representation. This - @ function inverts those transforms. - @ - @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key - @ representation, which does not match the other aes_nohw_* - @ implementations. The ARM aes_nohw_* stores each 32-bit word - @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the - @ cost of extra REV and VREV32 operations in little-endian ARM. - - vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform - adr r2, Lk_mc_forward @ Must be aligned to 8 mod 16. - add r3, r2, 0x90 @ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) - - vld1.64 {q12}, [r2] - vmov.i8 q10, #0x5b @ Lk_s63 from vpaes-x86_64 - adr r11, Lk_opt @ Must be aligned to 8 mod 16. - vmov.i8 q11, #0x63 @ LK_s63 without Lk_ipt applied - - @ vpaes stores one fewer round count than bsaes, but the number of keys - @ is the same. - ldr r2, [r1,#240] - add r2, r2, #1 - str r2, [r0,#240] - - @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). - @ Invert this with .Lk_opt. - vld1.64 {q0}, [r1]! - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, - @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, - @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. -Loop_enc_key_to_bsaes: - vld1.64 {q0}, [r1]! - - @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle - @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. - @ We use r3 rather than r8 to avoid a callee-saved register. - vld1.64 {q1}, [r3] - vtbl.8 d4, {q0}, d2 - vtbl.8 d5, {q0}, d3 - add r3, r3, #16 - and r3, r3, #~(1<<6) - vmov q0, q2 - - @ Handle the last key differently. - subs r2, r2, #1 - beq Loop_enc_key_to_bsaes_last - - @ Multiply by the circulant. This is its own inverse. - vtbl.8 d2, {q0}, d24 - vtbl.8 d3, {q0}, d25 - vmov q0, q1 - vtbl.8 d4, {q1}, d24 - vtbl.8 d5, {q1}, d25 - veor q0, q0, q2 - vtbl.8 d2, {q2}, d24 - vtbl.8 d3, {q2}, d25 - veor q0, q0, q1 - - @ XOR and finish. - veor q0, q0, q10 - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - b Loop_enc_key_to_bsaes - -Loop_enc_key_to_bsaes_last: - @ The final key does not have a basis transform (note - @ .Lschedule_mangle_last inverts the original transform). It only XORs - @ 0x63 and applies ShiftRows. The latter was already inverted in the - @ loop. Note that, because we act on the original representation, we use - @ q11, not q10. - veor q0, q0, q11 - vrev32.8 q0, q0 - vst1.64 {q0}, [r0] - - @ Wipe registers which contained key material. - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - - ldmia sp!, {r11, pc} @ return - - -@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); -.globl _vpaes_decrypt_key_to_bsaes -.private_extern _vpaes_decrypt_key_to_bsaes -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt_key_to_bsaes -#endif -.align 4 -_vpaes_decrypt_key_to_bsaes: - stmdb sp!, {r11, lr} - - @ See _vpaes_schedule_core for the key schedule logic. Note vpaes - @ computes the decryption key schedule in reverse. Additionally, - @ aes-x86_64.pl shares some transformations, so we must only partially - @ invert vpaes's transformations. 
In general, vpaes computes in a - @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of - @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is - @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. - @ - @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key - @ representation, which does not match the other aes_nohw_* - @ implementations. The ARM aes_nohw_* stores each 32-bit word - @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the - @ cost of extra REV and VREV32 operations in little-endian ARM. - - adr r2, Lk_decrypt_transform - adr r3, Lk_sr+0x30 - adr r11, Lk_opt_then_skew @ Input to _vpaes_schedule_transform. - vld1.64 {q12}, [r2] @ Reuse q12 from encryption. - vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform - - @ vpaes stores one fewer round count than bsaes, but the number of keys - @ is the same. - ldr r2, [r1,#240] - add r2, r2, #1 - str r2, [r0,#240] - - @ Undo the basis change and reapply the S-box affine transform. See - @ .Lschedule_mangle_last. - vld1.64 {q0}, [r1]! - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ See _vpaes_schedule_mangle for the transform on the middle keys. Note - @ it simultaneously inverts MixColumns and the S-box affine transform. - @ See .Lk_dksd through .Lk_dks9. -Loop_dec_key_to_bsaes: - vld1.64 {q0}, [r1]! - - @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going - @ forwards cancels inverting for which direction we cycle r3. We use r3 - @ rather than r8 to avoid a callee-saved register. - vld1.64 {q1}, [r3] - vtbl.8 d4, {q0}, d2 - vtbl.8 d5, {q0}, d3 - add r3, r3, #64-16 - and r3, r3, #~(1<<6) - vmov q0, q2 - - @ Handle the last key differently. - subs r2, r2, #1 - beq Loop_dec_key_to_bsaes_last - - @ Undo the basis change and reapply the S-box affine transform. - bl _vpaes_schedule_transform - - @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We - @ combine the two operations in .Lk_decrypt_transform. - @ - @ TODO(davidben): Where does the rotation come from? - vtbl.8 d2, {q0}, d24 - vtbl.8 d3, {q0}, d25 - - vst1.64 {q1}, [r0]! - b Loop_dec_key_to_bsaes - -Loop_dec_key_to_bsaes_last: - @ The final key only inverts ShiftRows (already done in the loop). See - @ .Lschedule_am_decrypting. Its basis is not transformed. - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ Wipe registers which contained key material. - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - - ldmia sp!, {r11, pc} @ return - -.globl _vpaes_ctr32_encrypt_blocks -.private_extern _vpaes_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _vpaes_ctr32_encrypt_blocks -#endif -.align 4 -_vpaes_ctr32_encrypt_blocks: - mov ip, sp - stmdb sp!, {r7,r8,r9,r10,r11, lr} - @ This function uses q4-q7 (d8-d15), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - cmp r2, #0 - @ r8 is passed on the stack. - ldr r8, [ip] - beq Lctr32_done - - @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. - mov r9, r3 - mov r3, r2 - mov r2, r9 - - @ Load the IV and counter portion. - ldr r7, [r8, #12] - vld1.8 {q7}, [r8] - - bl _vpaes_preheat - rev r7, r7 @ The counter is big-endian. - -Lctr32_loop: - vmov q0, q7 - vld1.8 {q6}, [r0]! @ Load input ahead of time - bl _vpaes_encrypt_core - veor q0, q0, q6 @ XOR input and result - vst1.8 {q0}, [r1]! - subs r3, r3, #1 - @ Update the counter. 
- add r7, r7, #1 - rev r9, r7 - vmov.32 d15[1], r9 - bne Lctr32_loop - -Lctr32_done: - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - -#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__) -#endif // defined(__arm__) && defined(__APPLE__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-windows.windows.x86.S deleted file mode 100644 index e0c1039a..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-windows.windows.x86.S +++ /dev/null @@ -1,686 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. - -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -%ifdef BORINGSSL_DISPATCH_TEST -extern _BORINGSSL_function_hit -%endif -align 64 -L$_vpaes_consts: -dd 218628480,235210255,168496130,67568393 -dd 252381056,17041926,33884169,51187212 -dd 252645135,252645135,252645135,252645135 -dd 1512730624,3266504856,1377990664,3401244816 -dd 830229760,1275146365,2969422977,3447763452 -dd 3411033600,2979783055,338359620,2782886510 -dd 4209124096,907596821,221174255,1006095553 -dd 191964160,3799684038,3164090317,1589111125 -dd 182528256,1777043520,2877432650,3265356744 -dd 1874708224,3503451415,3305285752,363511674 -dd 1606117888,3487855781,1093350906,2384367825 -dd 197121,67569157,134941193,202313229 -dd 67569157,134941193,202313229,197121 -dd 134941193,202313229,197121,67569157 -dd 202313229,197121,67569157,134941193 -dd 33619971,100992007,168364043,235736079 -dd 235736079,33619971,100992007,168364043 -dd 168364043,235736079,33619971,100992007 -dd 100992007,168364043,235736079,33619971 -dd 50462976,117835012,185207048,252579084 -dd 252314880,51251460,117574920,184942860 -dd 184682752,252054788,50987272,118359308 -dd 118099200,185467140,251790600,50727180 -dd 2946363062,528716217,1300004225,1881839624 -dd 1532713819,1532713819,1532713819,1532713819 -dd 3602276352,4288629033,3737020424,4153884961 -dd 1354558464,32357713,2958822624,3775749553 -dd 1201988352,132424512,1572796698,503232858 -dd 2213177600,1597421020,4103937655,675398315 -dd 2749646592,4273543773,1511898873,121693092 -dd 3040248576,1103263732,2871565598,1608280554 -dd 2236667136,2588920351,482954393,64377734 -dd 3069987328,291237287,2117370568,3650299247 -dd 533321216,3573750986,2572112006,1401264716 -dd 1339849704,2721158661,548607111,3445553514 -dd 2128193280,3054596040,2183486460,1257083700 -dd 655635200,1165381986,3923443150,2344132524 -dd 190078720,256924420,290342170,357187870 -dd 1610966272,2263057382,4103205268,309794674 -dd 2592527872,2233205587,1335446729,3402964816 -dd 3973531904,3225098121,3002836325,1918774430 -dd 3870401024,2102906079,2284471353,4117666579 -dd 617007872,1021508343,366931923,691083277 -dd 2528395776,3491914898,2968704004,1613121270 -dd 3445188352,3247741094,844474987,4093578302 -dd 651481088,1190302358,1689581232,574775300 -dd 4289380608,206939853,2555985458,2489840491 -dd 2130264064,327674451,3566485037,3349835193 -dd 2470714624,316102159,3636825756,3393945945 -db 
86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 -db 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 -db 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 -db 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 -db 118,101,114,115,105,116,121,41,0 -align 64 -align 16 -__vpaes_preheat: - add ebp,DWORD [esp] - movdqa xmm7,[ebp-48] - movdqa xmm6,[ebp-16] - ret -align 16 -__vpaes_encrypt_core: - mov ecx,16 - mov eax,DWORD [240+edx] - movdqa xmm1,xmm6 - movdqa xmm2,[ebp] - pandn xmm1,xmm0 - pand xmm0,xmm6 - movdqu xmm5,[edx] -db 102,15,56,0,208 - movdqa xmm0,[16+ebp] - pxor xmm2,xmm5 - psrld xmm1,4 - add edx,16 -db 102,15,56,0,193 - lea ebx,[192+ebp] - pxor xmm0,xmm2 - jmp NEAR L$000enc_entry -align 16 -L$001enc_loop: - movdqa xmm4,[32+ebp] - movdqa xmm0,[48+ebp] -db 102,15,56,0,226 -db 102,15,56,0,195 - pxor xmm4,xmm5 - movdqa xmm5,[64+ebp] - pxor xmm0,xmm4 - movdqa xmm1,[ecx*1+ebx-64] -db 102,15,56,0,234 - movdqa xmm2,[80+ebp] - movdqa xmm4,[ecx*1+ebx] -db 102,15,56,0,211 - movdqa xmm3,xmm0 - pxor xmm2,xmm5 -db 102,15,56,0,193 - add edx,16 - pxor xmm0,xmm2 -db 102,15,56,0,220 - add ecx,16 - pxor xmm3,xmm0 -db 102,15,56,0,193 - and ecx,48 - sub eax,1 - pxor xmm0,xmm3 -L$000enc_entry: - movdqa xmm1,xmm6 - movdqa xmm5,[ebp-32] - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm6 -db 102,15,56,0,232 - movdqa xmm3,xmm7 - pxor xmm0,xmm1 -db 102,15,56,0,217 - movdqa xmm4,xmm7 - pxor xmm3,xmm5 -db 102,15,56,0,224 - movdqa xmm2,xmm7 - pxor xmm4,xmm5 -db 102,15,56,0,211 - movdqa xmm3,xmm7 - pxor xmm2,xmm0 -db 102,15,56,0,220 - movdqu xmm5,[edx] - pxor xmm3,xmm1 - jnz NEAR L$001enc_loop - movdqa xmm4,[96+ebp] - movdqa xmm0,[112+ebp] -db 102,15,56,0,226 - pxor xmm4,xmm5 -db 102,15,56,0,195 - movdqa xmm1,[64+ecx*1+ebx] - pxor xmm0,xmm4 -db 102,15,56,0,193 - ret -align 16 -__vpaes_decrypt_core: - lea ebx,[608+ebp] - mov eax,DWORD [240+edx] - movdqa xmm1,xmm6 - movdqa xmm2,[ebx-64] - pandn xmm1,xmm0 - mov ecx,eax - psrld xmm1,4 - movdqu xmm5,[edx] - shl ecx,4 - pand xmm0,xmm6 -db 102,15,56,0,208 - movdqa xmm0,[ebx-48] - xor ecx,48 -db 102,15,56,0,193 - and ecx,48 - pxor xmm2,xmm5 - movdqa xmm5,[176+ebp] - pxor xmm0,xmm2 - add edx,16 - lea ecx,[ecx*1+ebx-352] - jmp NEAR L$002dec_entry -align 16 -L$003dec_loop: - movdqa xmm4,[ebx-32] - movdqa xmm1,[ebx-16] -db 102,15,56,0,226 -db 102,15,56,0,203 - pxor xmm0,xmm4 - movdqa xmm4,[ebx] - pxor xmm0,xmm1 - movdqa xmm1,[16+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 - pxor xmm0,xmm4 - movdqa xmm4,[32+ebx] - pxor xmm0,xmm1 - movdqa xmm1,[48+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 - pxor xmm0,xmm4 - movdqa xmm4,[64+ebx] - pxor xmm0,xmm1 - movdqa xmm1,[80+ebx] -db 102,15,56,0,226 -db 102,15,56,0,197 -db 102,15,56,0,203 - pxor xmm0,xmm4 - add edx,16 -db 102,15,58,15,237,12 - pxor xmm0,xmm1 - sub eax,1 -L$002dec_entry: - movdqa xmm1,xmm6 - movdqa xmm2,[ebp-32] - pandn xmm1,xmm0 - pand xmm0,xmm6 - psrld xmm1,4 -db 102,15,56,0,208 - movdqa xmm3,xmm7 - pxor xmm0,xmm1 -db 102,15,56,0,217 - movdqa xmm4,xmm7 - pxor xmm3,xmm2 -db 102,15,56,0,224 - pxor xmm4,xmm2 - movdqa xmm2,xmm7 -db 102,15,56,0,211 - movdqa xmm3,xmm7 - pxor xmm2,xmm0 -db 102,15,56,0,220 - movdqu xmm0,[edx] - pxor xmm3,xmm1 - jnz NEAR L$003dec_loop - movdqa xmm4,[96+ebx] -db 102,15,56,0,226 - pxor xmm4,xmm0 - movdqa xmm0,[112+ebx] - movdqa xmm2,[ecx] -db 102,15,56,0,195 - pxor xmm0,xmm4 -db 102,15,56,0,194 - ret -align 16 -__vpaes_schedule_core: - add ebp,DWORD [esp] - movdqu xmm0,[esi] - movdqa xmm2,[320+ebp] - movdqa xmm3,xmm0 - lea 
ebx,[ebp] - movdqa [4+esp],xmm2 - call __vpaes_schedule_transform - movdqa xmm7,xmm0 - test edi,edi - jnz NEAR L$004schedule_am_decrypting - movdqu [edx],xmm0 - jmp NEAR L$005schedule_go -L$004schedule_am_decrypting: - movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,217 - movdqu [edx],xmm3 - xor ecx,48 -L$005schedule_go: - cmp eax,192 - ja NEAR L$006schedule_256 - je NEAR L$007schedule_192 -L$008schedule_128: - mov eax,10 -L$009loop_schedule_128: - call __vpaes_schedule_round - dec eax - jz NEAR L$010schedule_mangle_last - call __vpaes_schedule_mangle - jmp NEAR L$009loop_schedule_128 -align 16 -L$007schedule_192: - movdqu xmm0,[8+esi] - call __vpaes_schedule_transform - movdqa xmm6,xmm0 - pxor xmm4,xmm4 - movhlps xmm6,xmm4 - mov eax,4 -L$011loop_schedule_192: - call __vpaes_schedule_round -db 102,15,58,15,198,8 - call __vpaes_schedule_mangle - call __vpaes_schedule_192_smear - call __vpaes_schedule_mangle - call __vpaes_schedule_round - dec eax - jz NEAR L$010schedule_mangle_last - call __vpaes_schedule_mangle - call __vpaes_schedule_192_smear - jmp NEAR L$011loop_schedule_192 -align 16 -L$006schedule_256: - movdqu xmm0,[16+esi] - call __vpaes_schedule_transform - mov eax,7 -L$012loop_schedule_256: - call __vpaes_schedule_mangle - movdqa xmm6,xmm0 - call __vpaes_schedule_round - dec eax - jz NEAR L$010schedule_mangle_last - call __vpaes_schedule_mangle - pshufd xmm0,xmm0,255 - movdqa [20+esp],xmm7 - movdqa xmm7,xmm6 - call L$_vpaes_schedule_low_round - movdqa xmm7,[20+esp] - jmp NEAR L$012loop_schedule_256 -align 16 -L$010schedule_mangle_last: - lea ebx,[384+ebp] - test edi,edi - jnz NEAR L$013schedule_mangle_last_dec - movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,193 - lea ebx,[352+ebp] - add edx,32 -L$013schedule_mangle_last_dec: - add edx,-16 - pxor xmm0,[336+ebp] - call __vpaes_schedule_transform - movdqu [edx],xmm0 - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - pxor xmm6,xmm6 - pxor xmm7,xmm7 - ret -align 16 -__vpaes_schedule_192_smear: - pshufd xmm1,xmm6,128 - pshufd xmm0,xmm7,254 - pxor xmm6,xmm1 - pxor xmm1,xmm1 - pxor xmm6,xmm0 - movdqa xmm0,xmm6 - movhlps xmm6,xmm1 - ret -align 16 -__vpaes_schedule_round: - movdqa xmm2,[8+esp] - pxor xmm1,xmm1 -db 102,15,58,15,202,15 -db 102,15,58,15,210,15 - pxor xmm7,xmm1 - pshufd xmm0,xmm0,255 -db 102,15,58,15,192,1 - movdqa [8+esp],xmm2 -L$_vpaes_schedule_low_round: - movdqa xmm1,xmm7 - pslldq xmm7,4 - pxor xmm7,xmm1 - movdqa xmm1,xmm7 - pslldq xmm7,8 - pxor xmm7,xmm1 - pxor xmm7,[336+ebp] - movdqa xmm4,[ebp-16] - movdqa xmm5,[ebp-48] - movdqa xmm1,xmm4 - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm4 - movdqa xmm2,[ebp-32] -db 102,15,56,0,208 - pxor xmm0,xmm1 - movdqa xmm3,xmm5 -db 102,15,56,0,217 - pxor xmm3,xmm2 - movdqa xmm4,xmm5 -db 102,15,56,0,224 - pxor xmm4,xmm2 - movdqa xmm2,xmm5 -db 102,15,56,0,211 - pxor xmm2,xmm0 - movdqa xmm3,xmm5 -db 102,15,56,0,220 - pxor xmm3,xmm1 - movdqa xmm4,[32+ebp] -db 102,15,56,0,226 - movdqa xmm0,[48+ebp] -db 102,15,56,0,195 - pxor xmm0,xmm4 - pxor xmm0,xmm7 - movdqa xmm7,xmm0 - ret -align 16 -__vpaes_schedule_transform: - movdqa xmm2,[ebp-16] - movdqa xmm1,xmm2 - pandn xmm1,xmm0 - psrld xmm1,4 - pand xmm0,xmm2 - movdqa xmm2,[ebx] -db 102,15,56,0,208 - movdqa xmm0,[16+ebx] -db 102,15,56,0,193 - pxor xmm0,xmm2 - ret -align 16 -__vpaes_schedule_mangle: - movdqa xmm4,xmm0 - movdqa xmm5,[128+ebp] - test edi,edi - jnz NEAR L$014schedule_mangle_dec - add edx,16 - pxor xmm4,[336+ebp] -db 102,15,56,0,229 - movdqa xmm3,xmm4 -db 102,15,56,0,229 - pxor 
xmm3,xmm4 -db 102,15,56,0,229 - pxor xmm3,xmm4 - jmp NEAR L$015schedule_mangle_both -align 16 -L$014schedule_mangle_dec: - movdqa xmm2,[ebp-16] - lea esi,[416+ebp] - movdqa xmm1,xmm2 - pandn xmm1,xmm4 - psrld xmm1,4 - pand xmm4,xmm2 - movdqa xmm2,[esi] -db 102,15,56,0,212 - movdqa xmm3,[16+esi] -db 102,15,56,0,217 - pxor xmm3,xmm2 -db 102,15,56,0,221 - movdqa xmm2,[32+esi] -db 102,15,56,0,212 - pxor xmm2,xmm3 - movdqa xmm3,[48+esi] -db 102,15,56,0,217 - pxor xmm3,xmm2 -db 102,15,56,0,221 - movdqa xmm2,[64+esi] -db 102,15,56,0,212 - pxor xmm2,xmm3 - movdqa xmm3,[80+esi] -db 102,15,56,0,217 - pxor xmm3,xmm2 -db 102,15,56,0,221 - movdqa xmm2,[96+esi] -db 102,15,56,0,212 - pxor xmm2,xmm3 - movdqa xmm3,[112+esi] -db 102,15,56,0,217 - pxor xmm3,xmm2 - add edx,-16 -L$015schedule_mangle_both: - movdqa xmm1,[256+ecx*1+ebp] -db 102,15,56,0,217 - add ecx,-16 - and ecx,48 - movdqu [edx],xmm3 - ret -global _vpaes_set_encrypt_key -align 16 -_vpaes_set_encrypt_key: -L$_vpaes_set_encrypt_key_begin: - push ebp - push ebx - push esi - push edi -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$016pic -L$016pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+5-L$016pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - mov esi,DWORD [20+esp] - lea ebx,[esp-56] - mov eax,DWORD [24+esp] - and ebx,-16 - mov edx,DWORD [28+esp] - xchg ebx,esp - mov DWORD [48+esp],ebx - mov ebx,eax - shr ebx,5 - add ebx,5 - mov DWORD [240+edx],ebx - mov ecx,48 - mov edi,0 - lea ebp,[(L$_vpaes_consts+0x30-L$017pic_point)] - call __vpaes_schedule_core -L$017pic_point: - mov esp,DWORD [48+esp] - xor eax,eax - pop edi - pop esi - pop ebx - pop ebp - ret -global _vpaes_set_decrypt_key -align 16 -_vpaes_set_decrypt_key: -L$_vpaes_set_decrypt_key_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - lea ebx,[esp-56] - mov eax,DWORD [24+esp] - and ebx,-16 - mov edx,DWORD [28+esp] - xchg ebx,esp - mov DWORD [48+esp],ebx - mov ebx,eax - shr ebx,5 - add ebx,5 - mov DWORD [240+edx],ebx - shl ebx,4 - lea edx,[16+ebx*1+edx] - mov edi,1 - mov ecx,eax - shr ecx,1 - and ecx,32 - xor ecx,32 - lea ebp,[(L$_vpaes_consts+0x30-L$018pic_point)] - call __vpaes_schedule_core -L$018pic_point: - mov esp,DWORD [48+esp] - xor eax,eax - pop edi - pop esi - pop ebx - pop ebp - ret -global _vpaes_encrypt -align 16 -_vpaes_encrypt: -L$_vpaes_encrypt_begin: - push ebp - push ebx - push esi - push edi -%ifdef BORINGSSL_DISPATCH_TEST - push ebx - push edx - call L$019pic -L$019pic: - pop ebx - lea ebx,[(_BORINGSSL_function_hit+4-L$019pic)+ebx] - mov edx,1 - mov BYTE [ebx],dl - pop edx - pop ebx -%endif - lea ebp,[(L$_vpaes_consts+0x30-L$020pic_point)] - call __vpaes_preheat -L$020pic_point: - mov esi,DWORD [20+esp] - lea ebx,[esp-56] - mov edi,DWORD [24+esp] - and ebx,-16 - mov edx,DWORD [28+esp] - xchg ebx,esp - mov DWORD [48+esp],ebx - movdqu xmm0,[esi] - call __vpaes_encrypt_core - movdqu [edi],xmm0 - mov esp,DWORD [48+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _vpaes_decrypt -align 16 -_vpaes_decrypt: -L$_vpaes_decrypt_begin: - push ebp - push ebx - push esi - push edi - lea ebp,[(L$_vpaes_consts+0x30-L$021pic_point)] - call __vpaes_preheat -L$021pic_point: - mov esi,DWORD [20+esp] - lea ebx,[esp-56] - mov edi,DWORD [24+esp] - and ebx,-16 - mov edx,DWORD [28+esp] - xchg ebx,esp - mov DWORD [48+esp],ebx - movdqu xmm0,[esi] - call __vpaes_decrypt_core - movdqu [edi],xmm0 - mov esp,DWORD [48+esp] - pop edi - pop esi - pop ebx - pop ebp - ret -global _vpaes_cbc_encrypt -align 16 -_vpaes_cbc_encrypt: 
-L$_vpaes_cbc_encrypt_begin: - push ebp - push ebx - push esi - push edi - mov esi,DWORD [20+esp] - mov edi,DWORD [24+esp] - mov eax,DWORD [28+esp] - mov edx,DWORD [32+esp] - sub eax,16 - jc NEAR L$022cbc_abort - lea ebx,[esp-56] - mov ebp,DWORD [36+esp] - and ebx,-16 - mov ecx,DWORD [40+esp] - xchg ebx,esp - movdqu xmm1,[ebp] - sub edi,esi - mov DWORD [48+esp],ebx - mov DWORD [esp],edi - mov DWORD [4+esp],edx - mov DWORD [8+esp],ebp - mov edi,eax - lea ebp,[(L$_vpaes_consts+0x30-L$023pic_point)] - call __vpaes_preheat -L$023pic_point: - cmp ecx,0 - je NEAR L$024cbc_dec_loop - jmp NEAR L$025cbc_enc_loop -align 16 -L$025cbc_enc_loop: - movdqu xmm0,[esi] - pxor xmm0,xmm1 - call __vpaes_encrypt_core - mov ebx,DWORD [esp] - mov edx,DWORD [4+esp] - movdqa xmm1,xmm0 - movdqu [esi*1+ebx],xmm0 - lea esi,[16+esi] - sub edi,16 - jnc NEAR L$025cbc_enc_loop - jmp NEAR L$026cbc_done -align 16 -L$024cbc_dec_loop: - movdqu xmm0,[esi] - movdqa [16+esp],xmm1 - movdqa [32+esp],xmm0 - call __vpaes_decrypt_core - mov ebx,DWORD [esp] - mov edx,DWORD [4+esp] - pxor xmm0,[16+esp] - movdqa xmm1,[32+esp] - movdqu [esi*1+ebx],xmm0 - lea esi,[16+esi] - sub edi,16 - jnc NEAR L$024cbc_dec_loop -L$026cbc_done: - mov ebx,DWORD [8+esp] - mov esp,DWORD [48+esp] - movdqu [ebx],xmm1 -L$022cbc_abort: - pop edi - pop esi - pop ebx - pop ebp - ret -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-linux.linux.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-linux.linux.x86.S deleted file mode 100644 index 1da96a46..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-linux.linux.x86.S +++ /dev/null @@ -1,489 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#include - -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -.text -.globl bn_mul_mont -.hidden bn_mul_mont -.type bn_mul_mont,@function -.align 16 -bn_mul_mont: -.L_bn_mul_mont_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - xorl %eax,%eax - movl 40(%esp),%edi - cmpl $4,%edi - jl .L000just_leave - leal 20(%esp),%esi - leal 24(%esp),%edx - addl $2,%edi - negl %edi - leal -32(%esp,%edi,4),%ebp - negl %edi - movl %ebp,%eax - subl %edx,%eax - andl $2047,%eax - subl %eax,%ebp - xorl %ebp,%edx - andl $2048,%edx - xorl $2048,%edx - subl %edx,%ebp - andl $-64,%ebp - movl %esp,%eax - subl %ebp,%eax - andl $-4096,%eax - movl %esp,%edx - leal (%ebp,%eax,1),%esp - movl (%esp),%eax - cmpl %ebp,%esp - ja .L001page_walk - jmp .L002page_walk_done -.align 16 -.L001page_walk: - leal -4096(%esp),%esp - movl (%esp),%eax - cmpl %ebp,%esp - ja .L001page_walk -.L002page_walk_done: - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%ebp - movl 16(%esi),%esi - movl (%esi),%esi - movl %eax,4(%esp) - movl %ebx,8(%esp) - movl %ecx,12(%esp) - movl %ebp,16(%esp) - movl %esi,20(%esp) - leal -3(%edi),%ebx - movl %edx,24(%esp) - call .L003PIC_me_up -.L003PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L004non_sse2 - movl $-1,%eax - movd %eax,%mm7 - movl 8(%esp),%esi - movl 12(%esp),%edi - movl 16(%esp),%ebp - xorl %edx,%edx - xorl %ecx,%ecx - movd (%edi),%mm4 - movd (%esi),%mm5 - movd (%ebp),%mm3 - pmuludq %mm4,%mm5 - movq %mm5,%mm2 - movq %mm5,%mm0 - pand %mm7,%mm0 - pmuludq 20(%esp),%mm5 - pmuludq %mm5,%mm3 - paddq %mm0,%mm3 - movd 4(%ebp),%mm1 - movd 4(%esi),%mm0 - psrlq $32,%mm2 - psrlq $32,%mm3 - incl %ecx -.align 16 -.L0051st: - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - movd 4(%ebp,%ecx,4),%mm1 - paddq %mm0,%mm3 - movd 4(%esi,%ecx,4),%mm0 - psrlq $32,%mm2 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm3 - leal 1(%ecx),%ecx - cmpl %ebx,%ecx - jl .L0051st - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - paddq %mm0,%mm3 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm2 - psrlq $32,%mm3 - paddq %mm2,%mm3 - movq %mm3,32(%esp,%ebx,4) - incl %edx -.L006outer: - xorl %ecx,%ecx - movd (%edi,%edx,4),%mm4 - movd (%esi),%mm5 - movd 32(%esp),%mm6 - movd (%ebp),%mm3 - pmuludq %mm4,%mm5 - paddq %mm6,%mm5 - movq %mm5,%mm0 - movq %mm5,%mm2 - pand %mm7,%mm0 - pmuludq 20(%esp),%mm5 - pmuludq %mm5,%mm3 - paddq %mm0,%mm3 - movd 36(%esp),%mm6 - movd 4(%ebp),%mm1 - movd 4(%esi),%mm0 - psrlq $32,%mm2 - psrlq $32,%mm3 - paddq %mm6,%mm2 - incl %ecx - decl %ebx -.L007inner: - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - movd 36(%esp,%ecx,4),%mm6 - pand %mm7,%mm0 - movd 4(%ebp,%ecx,4),%mm1 - paddq %mm0,%mm3 - movd 4(%esi,%ecx,4),%mm0 - psrlq $32,%mm2 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm3 - paddq %mm6,%mm2 - decl %ebx - leal 1(%ecx),%ecx - jnz .L007inner - movl %ecx,%ebx - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - paddq %mm0,%mm3 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm2 - psrlq $32,%mm3 - movd 36(%esp,%ebx,4),%mm6 - paddq %mm2,%mm3 - paddq %mm6,%mm3 - movq %mm3,32(%esp,%ebx,4) - leal 1(%edx),%edx - cmpl %ebx,%edx - jle .L006outer - emms - jmp .L008common_tail -.align 16 -.L004non_sse2: - movl 8(%esp),%esi - leal 1(%ebx),%ebp - movl 12(%esp),%edi - xorl %ecx,%ecx - movl %esi,%edx - andl 
$1,%ebp - subl %edi,%edx - leal 4(%edi,%ebx,4),%eax - orl %edx,%ebp - movl (%edi),%edi - jz .L009bn_sqr_mont - movl %eax,28(%esp) - movl (%esi),%eax - xorl %edx,%edx -.align 16 -.L010mull: - movl %edx,%ebp - mull %edi - addl %eax,%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - movl (%esi,%ecx,4),%eax - cmpl %ebx,%ecx - movl %ebp,28(%esp,%ecx,4) - jl .L010mull - movl %edx,%ebp - mull %edi - movl 20(%esp),%edi - addl %ebp,%eax - movl 16(%esp),%esi - adcl $0,%edx - imull 32(%esp),%edi - movl %eax,32(%esp,%ebx,4) - xorl %ecx,%ecx - movl %edx,36(%esp,%ebx,4) - movl %ecx,40(%esp,%ebx,4) - movl (%esi),%eax - mull %edi - addl 32(%esp),%eax - movl 4(%esi),%eax - adcl $0,%edx - incl %ecx - jmp .L0112ndmadd -.align 16 -.L0121stmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,28(%esp,%ecx,4) - jl .L0121stmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%eax - movl 20(%esp),%edi - adcl $0,%edx - movl 16(%esp),%esi - addl %eax,%ebp - adcl $0,%edx - imull 32(%esp),%edi - xorl %ecx,%ecx - addl 36(%esp,%ebx,4),%edx - movl %ebp,32(%esp,%ebx,4) - adcl $0,%ecx - movl (%esi),%eax - movl %edx,36(%esp,%ebx,4) - movl %ecx,40(%esp,%ebx,4) - mull %edi - addl 32(%esp),%eax - movl 4(%esi),%eax - adcl $0,%edx - movl $1,%ecx -.align 16 -.L0112ndmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,24(%esp,%ecx,4) - jl .L0112ndmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - adcl $0,%edx - movl %ebp,28(%esp,%ebx,4) - xorl %eax,%eax - movl 12(%esp),%ecx - addl 36(%esp,%ebx,4),%edx - adcl 40(%esp,%ebx,4),%eax - leal 4(%ecx),%ecx - movl %edx,32(%esp,%ebx,4) - cmpl 28(%esp),%ecx - movl %eax,36(%esp,%ebx,4) - je .L008common_tail - movl (%ecx),%edi - movl 8(%esp),%esi - movl %ecx,12(%esp) - xorl %ecx,%ecx - xorl %edx,%edx - movl (%esi),%eax - jmp .L0121stmadd -.align 16 -.L009bn_sqr_mont: - movl %ebx,(%esp) - movl %ecx,12(%esp) - movl %edi,%eax - mull %edi - movl %eax,32(%esp) - movl %edx,%ebx - shrl $1,%edx - andl $1,%ebx - incl %ecx -.align 16 -.L013sqr: - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - leal 1(%ecx),%ecx - adcl $0,%edx - leal (%ebx,%eax,2),%ebp - shrl $31,%eax - cmpl (%esp),%ecx - movl %eax,%ebx - movl %ebp,28(%esp,%ecx,4) - jl .L013sqr - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - movl 20(%esp),%edi - adcl $0,%edx - movl 16(%esp),%esi - leal (%ebx,%eax,2),%ebp - imull 32(%esp),%edi - shrl $31,%eax - movl %ebp,32(%esp,%ecx,4) - leal (%eax,%edx,2),%ebp - movl (%esi),%eax - shrl $31,%edx - movl %ebp,36(%esp,%ecx,4) - movl %edx,40(%esp,%ecx,4) - mull %edi - addl 32(%esp),%eax - movl %ecx,%ebx - adcl $0,%edx - movl 4(%esi),%eax - movl $1,%ecx -.align 16 -.L0143rdmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - movl 4(%esi,%ecx,4),%eax - adcl $0,%edx - movl %ebp,28(%esp,%ecx,4) - movl %edx,%ebp - mull %edi - addl 36(%esp,%ecx,4),%ebp - leal 2(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,24(%esp,%ecx,4) - jl .L0143rdmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - adcl $0,%edx - movl %ebp,28(%esp,%ebx,4) - movl 12(%esp),%ecx - xorl %eax,%eax - movl 8(%esp),%esi - addl 36(%esp,%ebx,4),%edx - 
adcl 40(%esp,%ebx,4),%eax - movl %edx,32(%esp,%ebx,4) - cmpl %ebx,%ecx - movl %eax,36(%esp,%ebx,4) - je .L008common_tail - movl 4(%esi,%ecx,4),%edi - leal 1(%ecx),%ecx - movl %edi,%eax - movl %ecx,12(%esp) - mull %edi - addl 32(%esp,%ecx,4),%eax - adcl $0,%edx - movl %eax,32(%esp,%ecx,4) - xorl %ebp,%ebp - cmpl %ebx,%ecx - leal 1(%ecx),%ecx - je .L015sqrlast - movl %edx,%ebx - shrl $1,%edx - andl $1,%ebx -.align 16 -.L016sqradd: - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - leal (%eax,%eax,1),%ebp - adcl $0,%edx - shrl $31,%eax - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%eax - addl %ebx,%ebp - adcl $0,%eax - cmpl (%esp),%ecx - movl %ebp,28(%esp,%ecx,4) - movl %eax,%ebx - jle .L016sqradd - movl %edx,%ebp - addl %edx,%edx - shrl $31,%ebp - addl %ebx,%edx - adcl $0,%ebp -.L015sqrlast: - movl 20(%esp),%edi - movl 16(%esp),%esi - imull 32(%esp),%edi - addl 32(%esp,%ecx,4),%edx - movl (%esi),%eax - adcl $0,%ebp - movl %edx,32(%esp,%ecx,4) - movl %ebp,36(%esp,%ecx,4) - mull %edi - addl 32(%esp),%eax - leal -1(%ecx),%ebx - adcl $0,%edx - movl $1,%ecx - movl 4(%esi),%eax - jmp .L0143rdmadd -.align 16 -.L008common_tail: - movl 16(%esp),%ebp - movl 4(%esp),%edi - leal 32(%esp),%esi - movl (%esi),%eax - movl %ebx,%ecx - xorl %edx,%edx -.align 16 -.L017sub: - sbbl (%ebp,%edx,4),%eax - movl %eax,(%edi,%edx,4) - decl %ecx - movl 4(%esi,%edx,4),%eax - leal 1(%edx),%edx - jge .L017sub - sbbl $0,%eax - movl $-1,%edx - xorl %eax,%edx - jmp .L018copy -.align 16 -.L018copy: - movl 32(%esp,%ebx,4),%esi - movl (%edi,%ebx,4),%ebp - movl %ecx,32(%esp,%ebx,4) - andl %eax,%esi - andl %edx,%ebp - orl %esi,%ebp - movl %ebp,(%edi,%ebx,4) - decl %ebx - jge .L018copy - movl 24(%esp),%esp - movl $1,%eax -.L000just_leave: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size bn_mul_mont,.-.L_bn_mul_mont_begin -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 -.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 -.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 -.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 -.byte 111,114,103,62,0 -#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-windows.windows.x86.S b/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-windows.windows.x86.S deleted file mode 100644 index c6eba63c..00000000 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86-mont-windows.windows.x86.S +++ /dev/null @@ -1,497 +0,0 @@ -#define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(_WIN32) -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. 
- -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -%ifidn __OUTPUT_FORMAT__, win32 -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -$@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -global _bn_mul_mont -align 16 -_bn_mul_mont: -L$_bn_mul_mont_begin: - push ebp - push ebx - push esi - push edi - xor eax,eax - mov edi,DWORD [40+esp] - cmp edi,4 - jl NEAR L$000just_leave - lea esi,[20+esp] - lea edx,[24+esp] - add edi,2 - neg edi - lea ebp,[edi*4+esp-32] - neg edi - mov eax,ebp - sub eax,edx - and eax,2047 - sub ebp,eax - xor edx,ebp - and edx,2048 - xor edx,2048 - sub ebp,edx - and ebp,-64 - mov eax,esp - sub eax,ebp - and eax,-4096 - mov edx,esp - lea esp,[eax*1+ebp] - mov eax,DWORD [esp] - cmp esp,ebp - ja NEAR L$001page_walk - jmp NEAR L$002page_walk_done -align 16 -L$001page_walk: - lea esp,[esp-4096] - mov eax,DWORD [esp] - cmp esp,ebp - ja NEAR L$001page_walk -L$002page_walk_done: - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov ebp,DWORD [12+esi] - mov esi,DWORD [16+esi] - mov esi,DWORD [esi] - mov DWORD [4+esp],eax - mov DWORD [8+esp],ebx - mov DWORD [12+esp],ecx - mov DWORD [16+esp],ebp - mov DWORD [20+esp],esi - lea ebx,[edi-3] - mov DWORD [24+esp],edx - lea eax,[_OPENSSL_ia32cap_P] - bt DWORD [eax],26 - jnc NEAR L$003non_sse2 - mov eax,-1 - movd mm7,eax - mov esi,DWORD [8+esp] - mov edi,DWORD [12+esp] - mov ebp,DWORD [16+esp] - xor edx,edx - xor ecx,ecx - movd mm4,DWORD [edi] - movd mm5,DWORD [esi] - movd mm3,DWORD [ebp] - pmuludq mm5,mm4 - movq mm2,mm5 - movq mm0,mm5 - pand mm0,mm7 - pmuludq mm5,[20+esp] - pmuludq mm3,mm5 - paddq mm3,mm0 - movd mm1,DWORD [4+ebp] - movd mm0,DWORD [4+esi] - psrlq mm2,32 - psrlq mm3,32 - inc ecx -align 16 -L$0041st: - pmuludq mm0,mm4 - pmuludq mm1,mm5 - paddq mm2,mm0 - paddq mm3,mm1 - movq mm0,mm2 - pand mm0,mm7 - movd mm1,DWORD [4+ecx*4+ebp] - paddq mm3,mm0 - movd mm0,DWORD [4+ecx*4+esi] - psrlq mm2,32 - movd DWORD [28+ecx*4+esp],mm3 - psrlq mm3,32 - lea ecx,[1+ecx] - cmp ecx,ebx - jl NEAR L$0041st - pmuludq mm0,mm4 - pmuludq mm1,mm5 - paddq mm2,mm0 - paddq mm3,mm1 - movq mm0,mm2 - pand mm0,mm7 - paddq mm3,mm0 - movd DWORD [28+ecx*4+esp],mm3 - psrlq mm2,32 - psrlq mm3,32 - paddq mm3,mm2 - movq [32+ebx*4+esp],mm3 - inc edx -L$005outer: - xor ecx,ecx - movd mm4,DWORD [edx*4+edi] - movd mm5,DWORD [esi] - movd mm6,DWORD [32+esp] - movd mm3,DWORD [ebp] - pmuludq mm5,mm4 - paddq mm5,mm6 - movq mm0,mm5 - movq mm2,mm5 - pand mm0,mm7 - pmuludq mm5,[20+esp] - pmuludq mm3,mm5 - paddq mm3,mm0 - movd mm6,DWORD [36+esp] - movd mm1,DWORD [4+ebp] - movd mm0,DWORD [4+esi] - psrlq mm2,32 - psrlq mm3,32 - paddq mm2,mm6 - inc ecx - dec ebx -L$006inner: - pmuludq mm0,mm4 - pmuludq mm1,mm5 - paddq mm2,mm0 - paddq mm3,mm1 - movq mm0,mm2 - movd mm6,DWORD [36+ecx*4+esp] - pand mm0,mm7 - movd mm1,DWORD [4+ecx*4+ebp] - paddq mm3,mm0 - movd mm0,DWORD [4+ecx*4+esi] - psrlq mm2,32 - movd DWORD [28+ecx*4+esp],mm3 - psrlq mm3,32 - paddq mm2,mm6 - dec ebx - lea ecx,[1+ecx] - jnz NEAR L$006inner - mov ebx,ecx - pmuludq mm0,mm4 - pmuludq mm1,mm5 - paddq mm2,mm0 - paddq mm3,mm1 - movq mm0,mm2 - pand mm0,mm7 - paddq mm3,mm0 - movd DWORD [28+ecx*4+esp],mm3 - psrlq mm2,32 - psrlq mm3,32 - movd mm6,DWORD [36+ebx*4+esp] - paddq mm3,mm2 - paddq mm3,mm6 - movq [32+ebx*4+esp],mm3 - lea edx,[1+edx] - cmp edx,ebx - jle NEAR L$005outer - emms - jmp NEAR L$007common_tail -align 16 -L$003non_sse2: - mov esi,DWORD [8+esp] - lea 
ebp,[1+ebx] - mov edi,DWORD [12+esp] - xor ecx,ecx - mov edx,esi - and ebp,1 - sub edx,edi - lea eax,[4+ebx*4+edi] - or ebp,edx - mov edi,DWORD [edi] - jz NEAR L$008bn_sqr_mont - mov DWORD [28+esp],eax - mov eax,DWORD [esi] - xor edx,edx -align 16 -L$009mull: - mov ebp,edx - mul edi - add ebp,eax - lea ecx,[1+ecx] - adc edx,0 - mov eax,DWORD [ecx*4+esi] - cmp ecx,ebx - mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$009mull - mov ebp,edx - mul edi - mov edi,DWORD [20+esp] - add eax,ebp - mov esi,DWORD [16+esp] - adc edx,0 - imul edi,DWORD [32+esp] - mov DWORD [32+ebx*4+esp],eax - xor ecx,ecx - mov DWORD [36+ebx*4+esp],edx - mov DWORD [40+ebx*4+esp],ecx - mov eax,DWORD [esi] - mul edi - add eax,DWORD [32+esp] - mov eax,DWORD [4+esi] - adc edx,0 - inc ecx - jmp NEAR L$0102ndmadd -align 16 -L$0111stmadd: - mov ebp,edx - mul edi - add ebp,DWORD [32+ecx*4+esp] - lea ecx,[1+ecx] - adc edx,0 - add ebp,eax - mov eax,DWORD [ecx*4+esi] - adc edx,0 - cmp ecx,ebx - mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$0111stmadd - mov ebp,edx - mul edi - add eax,DWORD [32+ebx*4+esp] - mov edi,DWORD [20+esp] - adc edx,0 - mov esi,DWORD [16+esp] - add ebp,eax - adc edx,0 - imul edi,DWORD [32+esp] - xor ecx,ecx - add edx,DWORD [36+ebx*4+esp] - mov DWORD [32+ebx*4+esp],ebp - adc ecx,0 - mov eax,DWORD [esi] - mov DWORD [36+ebx*4+esp],edx - mov DWORD [40+ebx*4+esp],ecx - mul edi - add eax,DWORD [32+esp] - mov eax,DWORD [4+esi] - adc edx,0 - mov ecx,1 -align 16 -L$0102ndmadd: - mov ebp,edx - mul edi - add ebp,DWORD [32+ecx*4+esp] - lea ecx,[1+ecx] - adc edx,0 - add ebp,eax - mov eax,DWORD [ecx*4+esi] - adc edx,0 - cmp ecx,ebx - mov DWORD [24+ecx*4+esp],ebp - jl NEAR L$0102ndmadd - mov ebp,edx - mul edi - add ebp,DWORD [32+ebx*4+esp] - adc edx,0 - add ebp,eax - adc edx,0 - mov DWORD [28+ebx*4+esp],ebp - xor eax,eax - mov ecx,DWORD [12+esp] - add edx,DWORD [36+ebx*4+esp] - adc eax,DWORD [40+ebx*4+esp] - lea ecx,[4+ecx] - mov DWORD [32+ebx*4+esp],edx - cmp ecx,DWORD [28+esp] - mov DWORD [36+ebx*4+esp],eax - je NEAR L$007common_tail - mov edi,DWORD [ecx] - mov esi,DWORD [8+esp] - mov DWORD [12+esp],ecx - xor ecx,ecx - xor edx,edx - mov eax,DWORD [esi] - jmp NEAR L$0111stmadd -align 16 -L$008bn_sqr_mont: - mov DWORD [esp],ebx - mov DWORD [12+esp],ecx - mov eax,edi - mul edi - mov DWORD [32+esp],eax - mov ebx,edx - shr edx,1 - and ebx,1 - inc ecx -align 16 -L$012sqr: - mov eax,DWORD [ecx*4+esi] - mov ebp,edx - mul edi - add eax,ebp - lea ecx,[1+ecx] - adc edx,0 - lea ebp,[eax*2+ebx] - shr eax,31 - cmp ecx,DWORD [esp] - mov ebx,eax - mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$012sqr - mov eax,DWORD [ecx*4+esi] - mov ebp,edx - mul edi - add eax,ebp - mov edi,DWORD [20+esp] - adc edx,0 - mov esi,DWORD [16+esp] - lea ebp,[eax*2+ebx] - imul edi,DWORD [32+esp] - shr eax,31 - mov DWORD [32+ecx*4+esp],ebp - lea ebp,[edx*2+eax] - mov eax,DWORD [esi] - shr edx,31 - mov DWORD [36+ecx*4+esp],ebp - mov DWORD [40+ecx*4+esp],edx - mul edi - add eax,DWORD [32+esp] - mov ebx,ecx - adc edx,0 - mov eax,DWORD [4+esi] - mov ecx,1 -align 16 -L$0133rdmadd: - mov ebp,edx - mul edi - add ebp,DWORD [32+ecx*4+esp] - adc edx,0 - add ebp,eax - mov eax,DWORD [4+ecx*4+esi] - adc edx,0 - mov DWORD [28+ecx*4+esp],ebp - mov ebp,edx - mul edi - add ebp,DWORD [36+ecx*4+esp] - lea ecx,[2+ecx] - adc edx,0 - add ebp,eax - mov eax,DWORD [ecx*4+esi] - adc edx,0 - cmp ecx,ebx - mov DWORD [24+ecx*4+esp],ebp - jl NEAR L$0133rdmadd - mov ebp,edx - mul edi - add ebp,DWORD [32+ebx*4+esp] - adc edx,0 - add ebp,eax - adc edx,0 - mov DWORD [28+ebx*4+esp],ebp - mov ecx,DWORD [12+esp] - 
xor eax,eax - mov esi,DWORD [8+esp] - add edx,DWORD [36+ebx*4+esp] - adc eax,DWORD [40+ebx*4+esp] - mov DWORD [32+ebx*4+esp],edx - cmp ecx,ebx - mov DWORD [36+ebx*4+esp],eax - je NEAR L$007common_tail - mov edi,DWORD [4+ecx*4+esi] - lea ecx,[1+ecx] - mov eax,edi - mov DWORD [12+esp],ecx - mul edi - add eax,DWORD [32+ecx*4+esp] - adc edx,0 - mov DWORD [32+ecx*4+esp],eax - xor ebp,ebp - cmp ecx,ebx - lea ecx,[1+ecx] - je NEAR L$014sqrlast - mov ebx,edx - shr edx,1 - and ebx,1 -align 16 -L$015sqradd: - mov eax,DWORD [ecx*4+esi] - mov ebp,edx - mul edi - add eax,ebp - lea ebp,[eax*1+eax] - adc edx,0 - shr eax,31 - add ebp,DWORD [32+ecx*4+esp] - lea ecx,[1+ecx] - adc eax,0 - add ebp,ebx - adc eax,0 - cmp ecx,DWORD [esp] - mov DWORD [28+ecx*4+esp],ebp - mov ebx,eax - jle NEAR L$015sqradd - mov ebp,edx - add edx,edx - shr ebp,31 - add edx,ebx - adc ebp,0 -L$014sqrlast: - mov edi,DWORD [20+esp] - mov esi,DWORD [16+esp] - imul edi,DWORD [32+esp] - add edx,DWORD [32+ecx*4+esp] - mov eax,DWORD [esi] - adc ebp,0 - mov DWORD [32+ecx*4+esp],edx - mov DWORD [36+ecx*4+esp],ebp - mul edi - add eax,DWORD [32+esp] - lea ebx,[ecx-1] - adc edx,0 - mov ecx,1 - mov eax,DWORD [4+esi] - jmp NEAR L$0133rdmadd -align 16 -L$007common_tail: - mov ebp,DWORD [16+esp] - mov edi,DWORD [4+esp] - lea esi,[32+esp] - mov eax,DWORD [esi] - mov ecx,ebx - xor edx,edx -align 16 -L$016sub: - sbb eax,DWORD [edx*4+ebp] - mov DWORD [edx*4+edi],eax - dec ecx - mov eax,DWORD [4+edx*4+esi] - lea edx,[1+edx] - jge NEAR L$016sub - sbb eax,0 - mov edx,-1 - xor edx,eax - jmp NEAR L$017copy -align 16 -L$017copy: - mov esi,DWORD [32+ebx*4+esp] - mov ebp,DWORD [ebx*4+edi] - mov DWORD [32+ebx*4+esp],ecx - and esi,eax - and ebp,edx - or ebp,esi - mov DWORD [ebx*4+edi],ebp - dec ebx - jge NEAR L$017copy - mov esp,DWORD [24+esp] - mov eax,1 -L$000just_leave: - pop edi - pop esi - pop ebx - pop ebp - ret -db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 -db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 -db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 -db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 -db 111,114,103,62,0 -segment .bss -common _OPENSSL_ia32cap_P 16 -%else -; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 -ret -%endif -#endif // defined(__i386__) && defined(_WIN32) -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - diff --git a/Sources/CCryptoBoringSSL/crypto/hpke/hpke.c b/Sources/CCryptoBoringSSL/crypto/hpke/hpke.c index 67ff51bb..0350ecdb 100644 --- a/Sources/CCryptoBoringSSL/crypto/hpke/hpke.c +++ b/Sources/CCryptoBoringSSL/crypto/hpke/hpke.c @@ -21,12 +21,15 @@ #include #include #include +#include #include #include #include +#include #include #include +#include "../fipsmodule/ec/internal.h" #include "../internal.h" @@ -111,7 +114,7 @@ static int hpke_labeled_expand(const EVP_MD *hkdf_md, uint8_t *out_key, const uint8_t *info, size_t info_len) { // labeledInfo = concat(I2OSP(L, 2), "HPKE-v1", suite_id, label, info) CBB labeled_info; - int ok = CBB_init(&labeled_info, 0) && + int ok = CBB_init(&labeled_info, 0) && // CBB_add_u16(&labeled_info, out_len) && add_label_string(&labeled_info, kHpkeVersionId) && CBB_add_bytes(&labeled_info, suite_id, suite_id_len) && @@ -309,6 +312,294 @@ const EVP_HPKE_KEM *EVP_hpke_x25519_hkdf_sha256(void) { return &kKEM; } +#define P256_PRIVATE_KEY_LEN 32 +#define P256_PUBLIC_KEY_LEN 65 +#define P256_PUBLIC_VALUE_LEN 65 +#define P256_SEED_LEN 32 +#define P256_SHARED_KEY_LEN 32 + +static int 
p256_public_from_private(uint8_t out_pub[P256_PUBLIC_VALUE_LEN], + const uint8_t priv[P256_PRIVATE_KEY_LEN]) { + const EC_GROUP *const group = EC_group_p256(); + const uint8_t kAllZeros[P256_PRIVATE_KEY_LEN] = {0}; + EC_SCALAR private_scalar; + EC_JACOBIAN public_point; + EC_AFFINE public_point_affine; + + if (CRYPTO_memcmp(kAllZeros, priv, sizeof(kAllZeros)) == 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!ec_scalar_from_bytes(group, &private_scalar, priv, + P256_PRIVATE_KEY_LEN) || + !ec_point_mul_scalar_base(group, &public_point, &private_scalar) || + !ec_jacobian_to_affine(group, &public_point_affine, &public_point)) { + return 0; + } + + size_t out_len_x, out_len_y; + out_pub[0] = POINT_CONVERSION_UNCOMPRESSED; + ec_felem_to_bytes(group, &out_pub[1], &out_len_x, &public_point_affine.X); + ec_felem_to_bytes(group, &out_pub[33], &out_len_y, &public_point_affine.Y); + return 1; +} + +static int p256_init_key(EVP_HPKE_KEY *key, const uint8_t *priv_key, + size_t priv_key_len) { + if (priv_key_len != P256_PRIVATE_KEY_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!p256_public_from_private(key->public_key, priv_key)) { + return 0; + } + + OPENSSL_memcpy(key->private_key, priv_key, priv_key_len); + return 1; +} + +static int p256_private_key_from_seed(uint8_t out_priv[P256_PRIVATE_KEY_LEN], + const uint8_t seed[P256_SEED_LEN]) { + // https://www.rfc-editor.org/rfc/rfc9180.html#name-derivekeypair + const uint8_t suite_id[5] = {'K', 'E', 'M', + EVP_HPKE_DHKEM_P256_HKDF_SHA256 >> 8, + EVP_HPKE_DHKEM_P256_HKDF_SHA256 & 0xff}; + + uint8_t dkp_prk[32]; + size_t dkp_prk_len; + if (!hpke_labeled_extract(EVP_sha256(), dkp_prk, &dkp_prk_len, NULL, 0, + suite_id, sizeof(suite_id), "dkp_prk", seed, + P256_SEED_LEN)) { + return 0; + } + assert(dkp_prk_len == sizeof(dkp_prk)); + + const EC_GROUP *const group = EC_group_p256(); + EC_SCALAR private_scalar; + + for (unsigned counter = 0; counter < 256; counter++) { + const uint8_t counter_byte = counter & 0xff; + if (!hpke_labeled_expand(EVP_sha256(), out_priv, P256_PRIVATE_KEY_LEN, + dkp_prk, sizeof(dkp_prk), suite_id, + sizeof(suite_id), "candidate", &counter_byte, + sizeof(counter_byte))) { + return 0; + } + + // This checks that the scalar is less than the order. + if (ec_scalar_from_bytes(group, &private_scalar, out_priv, + P256_PRIVATE_KEY_LEN)) { + return 1; + } + } + + // This happens with probability of 2^-(32*256). 
+ OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; +} + +static int p256_generate_key(EVP_HPKE_KEY *key) { + uint8_t seed[P256_SEED_LEN]; + RAND_bytes(seed, sizeof(seed)); + if (!p256_private_key_from_seed(key->private_key, seed) || + !p256_public_from_private(key->public_key, key->private_key)) { + return 0; + } + return 1; +} + +static int p256(uint8_t out_dh[P256_SHARED_KEY_LEN], + const uint8_t my_private[P256_PRIVATE_KEY_LEN], + const uint8_t their_public[P256_PUBLIC_VALUE_LEN]) { + const EC_GROUP *const group = EC_group_p256(); + EC_SCALAR private_scalar; + EC_FELEM x, y; + EC_JACOBIAN shared_point, their_point; + EC_AFFINE their_point_affine, shared_point_affine; + + if (their_public[0] != POINT_CONVERSION_UNCOMPRESSED || + !ec_felem_from_bytes(group, &x, &their_public[1], 32) || + !ec_felem_from_bytes(group, &y, &their_public[33], 32) || + !ec_point_set_affine_coordinates(group, &their_point_affine, &x, &y) || + !ec_scalar_from_bytes(group, &private_scalar, my_private, + P256_PRIVATE_KEY_LEN)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + ec_affine_to_jacobian(group, &their_point, &their_point_affine); + if (!ec_point_mul_scalar(group, &shared_point, &their_point, + &private_scalar) || + !ec_jacobian_to_affine(group, &shared_point_affine, &shared_point)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + size_t out_len; + ec_felem_to_bytes(group, out_dh, &out_len, &shared_point_affine.X); + assert(out_len == P256_SHARED_KEY_LEN); + return 1; +} + +static int p256_encap_with_seed(const EVP_HPKE_KEM *kem, + uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, + size_t *out_enc_len, size_t max_enc, + const uint8_t *peer_public_key, + size_t peer_public_key_len, const uint8_t *seed, + size_t seed_len) { + if (max_enc < P256_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (seed_len != P256_SEED_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + uint8_t private_key[P256_PRIVATE_KEY_LEN]; + if (!p256_private_key_from_seed(private_key, seed)) { + return 0; + } + p256_public_from_private(out_enc, private_key); + + uint8_t dh[P256_SHARED_KEY_LEN]; + if (peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[2 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, out_enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_enc_len = P256_PUBLIC_VALUE_LEN; + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len) { + uint8_t dh[P256_SHARED_KEY_LEN]; + if (enc_len != P256_PUBLIC_VALUE_LEN || // + !p256(dh, key->private_key, enc)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[2 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, 
sizeof(kem_context))) { + return 0; + } + + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_auth_encap_with_seed( + const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, size_t *out_enc_len, + size_t max_enc, const uint8_t *peer_public_key, size_t peer_public_key_len, + const uint8_t *seed, size_t seed_len) { + if (max_enc < P256_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (seed_len != P256_SEED_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + uint8_t private_key[P256_PRIVATE_KEY_LEN]; + if (!p256_private_key_from_seed(private_key, seed)) { + return 0; + } + p256_public_from_private(out_enc, private_key); + + uint8_t dh[2 * P256_SHARED_KEY_LEN]; + if (peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, private_key, peer_public_key) || + !p256(dh + P256_SHARED_KEY_LEN, key->private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[3 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, out_enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + 2 * P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_enc_len = P256_PUBLIC_VALUE_LEN; + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_auth_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len, const uint8_t *peer_public_key, + size_t peer_public_key_len) { + uint8_t dh[2 * P256_SHARED_KEY_LEN]; + if (enc_len != P256_PUBLIC_VALUE_LEN || + peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, key->private_key, enc) || + !p256(dh + P256_SHARED_KEY_LEN, key->private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[3 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + 2 * P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +const EVP_HPKE_KEM *EVP_hpke_p256_hkdf_sha256(void) { + static const EVP_HPKE_KEM kKEM = { + /*id=*/EVP_HPKE_DHKEM_P256_HKDF_SHA256, + /*public_key_len=*/P256_PUBLIC_KEY_LEN, + /*private_key_len=*/P256_PRIVATE_KEY_LEN, + /*seed_len=*/P256_SEED_LEN, + /*enc_len=*/P256_PUBLIC_VALUE_LEN, + p256_init_key, + p256_generate_key, + p256_encap_with_seed, + p256_decap, + p256_auth_encap_with_seed, + p256_auth_decap, + }; + return &kKEM; +} + uint16_t EVP_HPKE_KEM_id(const EVP_HPKE_KEM *kem) { return kem->id; } size_t EVP_HPKE_KEM_public_key_len(const EVP_HPKE_KEM *kem) { @@ -398,7 +689,7 @@ int EVP_HPKE_KEY_public_key(const EVP_HPKE_KEY *key, uint8_t *out, } int EVP_HPKE_KEY_private_key(const EVP_HPKE_KEY *key, uint8_t *out, - size_t *out_len, size_t max_out) { + size_t *out_len, size_t max_out) { if (max_out < 
key->kem->private_key_len) { OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); return 0; diff --git a/Sources/CCryptoBoringSSL/crypto/internal.h b/Sources/CCryptoBoringSSL/crypto/internal.h index c1882f26..a742ef6b 100644 --- a/Sources/CCryptoBoringSSL/crypto/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/internal.h @@ -180,17 +180,29 @@ extern "C" { #endif -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM) || \ - defined(OPENSSL_AARCH64) -// OPENSSL_cpuid_setup initializes the platform-specific feature cache. +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_STATIC_ARMCAP) && \ + (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ + defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) +// x86, x86_64, and the ARMs need to record the result of a cpuid/getauxval call +// for the asm to work correctly, unless compiled without asm code. +#define NEED_CPUID + +// OPENSSL_cpuid_setup initializes the platform-specific feature cache. This +// function should not be called directly. Call |OPENSSL_init_cpuid| instead. void OPENSSL_cpuid_setup(void); + +// OPENSSL_init_cpuid initializes the platform-specific feature cache, if +// needed. This function is idempotent and may be called concurrently. +void OPENSSL_init_cpuid(void); +#else +OPENSSL_INLINE void OPENSSL_init_cpuid(void) {} #endif #if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_STATIC_ARMCAP) // OPENSSL_get_armcap_pointer_for_test returns a pointer to |OPENSSL_armcap_P| -// for unit tests. Any modifications to the value must be made after -// |CRYPTO_library_init| but before any other function call in BoringSSL. +// for unit tests. Any modifications to the value must be made before any other +// function call in BoringSSL. OPENSSL_EXPORT uint32_t *OPENSSL_get_armcap_pointer_for_test(void); #endif @@ -205,7 +217,9 @@ typedef __uint128_t uint128_t; // __uint128_t division depends on intrinsics in the compiler runtime. Those // intrinsics are missing in clang-cl (https://crbug.com/787617) and nanolibc. // These may be bugs in the toolchain definition, but just disable it for now. -#if !defined(_MSC_VER) && !defined(OPENSSL_NANOLIBC) +// EDK2's toolchain is missing __udivti3 (b/339380897) so cannot support +// 128-bit division currently. +#if !defined(_MSC_VER) && !defined(OPENSSL_NANOLIBC) && !defined(__EDK2_BORINGSSL__) #define BORINGSSL_CAN_DIVIDE_UINT128 #endif #endif @@ -260,9 +274,9 @@ typedef __uint128_t uint128_t; #endif #if defined(__GNUC__) || defined(__clang__) -#define OPENSSL_ATTR_PURE __attribute__((pure)) +#define OPENSSL_ATTR_CONST __attribute__((const)) #else -#define OPENSSL_ATTR_PURE +#define OPENSSL_ATTR_CONST #endif #if defined(BORINGSSL_MALLOC_FAILURE_TESTING) @@ -555,11 +569,22 @@ static inline void constant_time_conditional_memcpy(void *dst, const void *src, // |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory // ranges at |dst| and |src| must not overlap, as when calling |memcpy|. 
static inline void constant_time_conditional_memxor(void *dst, const void *src, - const size_t n, + size_t n, const crypto_word_t mask) { assert(!buffers_alias(dst, n, src, n)); uint8_t *out = (uint8_t *)dst; const uint8_t *in = (const uint8_t *)src; +#if defined(__GNUC__) && !defined(__clang__) + // gcc 13.2.0 doesn't automatically vectorize this loop regardless of barrier + typedef uint8_t v32u8 __attribute__((vector_size(32), aligned(1), may_alias)); + size_t n_vec = n&~(size_t)31; + v32u8 masks = ((uint8_t)mask-(v32u8){}); // broadcast + for (size_t i = 0; i < n_vec; i += 32) { + *(v32u8*)&out[i] ^= masks & *(v32u8*)&in[i]; + } + out += n_vec; + n -= n_vec; +#endif for (size_t i = 0; i < n; i++) { out[i] ^= value_barrier_w(mask) & in[i]; } @@ -609,6 +634,12 @@ static inline int constant_time_declassify_int(int v) { return value_barrier_u32(v); } +// declassify_assert behaves like |assert| but declassifies the result of +// evaluating |expr|. This allows the assertion to branch on the (presumably +// public) result, but still ensures that values leading up to the computation +// were secret. +#define declassify_assert(expr) assert(constant_time_declassify_int(expr)) + // Thread-safe initialisation. @@ -1168,6 +1199,11 @@ static inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) { // Arithmetic functions. +// The most efficient versions of these functions on GCC and Clang depend on C11 +// |_Generic|. If we ever need to call these from C++, we'll need to add a +// variant that uses C++ overloads instead. +#if !defined(__cplusplus) + // CRYPTO_addc_* returns |x + y + carry|, and sets |*out_carry| to the carry // bit. |carry| must be zero or one. #if OPENSSL_HAS_BUILTIN(__builtin_addc) @@ -1180,13 +1216,13 @@ static inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) { static inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry, uint32_t *out_carry) { - assert(carry <= 1); + declassify_assert(carry <= 1); return CRYPTO_GENERIC_ADDC(x, y, carry, out_carry); } static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, uint64_t *out_carry) { - assert(carry <= 1); + declassify_assert(carry <= 1); return CRYPTO_GENERIC_ADDC(x, y, carry, out_carry); } @@ -1194,7 +1230,7 @@ static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, static inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry, uint32_t *out_carry) { - assert(carry <= 1); + declassify_assert(carry <= 1); uint64_t ret = carry; ret += (uint64_t)x + y; *out_carry = (uint32_t)(ret >> 32); @@ -1203,7 +1239,7 @@ static inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry, static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, uint64_t *out_carry) { - assert(carry <= 1); + declassify_assert(carry <= 1); #if defined(BORINGSSL_HAS_UINT128) uint128_t ret = carry; ret += (uint128_t)x + y; @@ -1232,13 +1268,13 @@ static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, static inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow, uint32_t *out_borrow) { - assert(borrow <= 1); + declassify_assert(borrow <= 1); return CRYPTO_GENERIC_SUBC(x, y, borrow, out_borrow); } static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow, uint64_t *out_borrow) { - assert(borrow <= 1); + declassify_assert(borrow <= 1); return CRYPTO_GENERIC_SUBC(x, y, borrow, out_borrow); } @@ -1246,7 +1282,7 @@ static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t 
borrow, static inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow, uint32_t *out_borrow) { - assert(borrow <= 1); + declassify_assert(borrow <= 1); uint32_t ret = x - y - borrow; *out_borrow = (x < y) | ((x == y) & borrow); return ret; @@ -1254,7 +1290,7 @@ static inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow, static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow, uint64_t *out_borrow) { - assert(borrow <= 1); + declassify_assert(borrow <= 1); uint64_t ret = x - y - borrow; *out_borrow = (x < y) | ((x == y) & borrow); return ret; @@ -1269,6 +1305,8 @@ static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow, #define CRYPTO_subc_w CRYPTO_subc_u32 #endif +#endif // !__cplusplus + // FIPS functions. @@ -1352,21 +1390,23 @@ OPENSSL_INLINE int boringssl_fips_break_test(const char *test) { // ECX for CPUID where EAX = 1 // Bit 11 is used to indicate AMD XOP support, not SDBG // Index 2: -// EBX for CPUID where EAX = 7 +// EBX for CPUID where EAX = 7, ECX = 0 +// Bit 14 (for removed feature MPX) is used to indicate a preference for ymm +// registers over zmm even when zmm registers are supported // Index 3: -// ECX for CPUID where EAX = 7 +// ECX for CPUID where EAX = 7, ECX = 0 // -// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM -// bits in XCR0, so it is not necessary to check those. (WARNING: See caveats -// in cpu_intel.c.) +// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the XMM, YMM, +// and AVX512 bits in XCR0, so it is not necessary to check those. (WARNING: See +// caveats in cpu_intel.c.) // // From C, this symbol should only be accessed with |OPENSSL_get_ia32cap|. extern uint32_t OPENSSL_ia32cap_P[4]; // OPENSSL_get_ia32cap initializes the library if needed and returns the |idx|th -// entry of |OPENSSL_ia32cap_P|. It is marked as a pure function so duplicate +// entry of |OPENSSL_ia32cap_P|. It is marked as a const function so duplicate // calls can be merged by the compiler, at least when indices match. -OPENSSL_ATTR_PURE uint32_t OPENSSL_get_ia32cap(int idx); +OPENSSL_ATTR_CONST uint32_t OPENSSL_get_ia32cap(int idx); // See Intel manual, volume 2A, table 3-11. @@ -1508,7 +1548,6 @@ OPENSSL_INLINE int CRYPTO_is_x86_SHA_capable(void) { // otherwise select. See chacha-x86_64.pl. // // Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by -// this. |OPENSSL_cpuid_setup| forces Knights Landing to also be matched by // this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it // isn't matched by this. Various sources indicate AMD first implemented MOVBE // and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be @@ -1517,15 +1556,56 @@ OPENSSL_INLINE int CRYPTO_cpu_perf_is_like_silvermont(void) { // WARNING: This MUST NOT be used to guard the execution of the XSAVE // instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit // that indicates whether we can safely execute XSAVE. This bit may be set - // even when XSAVE is disabled (by the operating system). See the comment in - // cpu_intel.c and check how the users of this bit use it. + // even when XSAVE is disabled (by the operating system). See how the users of + // this bit use it. // - // We do not use |__XSAVE__| for static detection because the hack in - // |OPENSSL_cpuid_setup| for Knights Landing CPUs needs to override it. 
+ // Historically, the XSAVE bit was artificially cleared on Knights Landing + // and Knights Mill chips, but as Intel has removed all support from GCC, + // LLVM, and SDE, we assume they are no longer worth special-casing. int hardware_supports_xsave = (OPENSSL_get_ia32cap(1) & (1u << 26)) != 0; return !hardware_supports_xsave && CRYPTO_is_MOVBE_capable(); } +OPENSSL_INLINE int CRYPTO_is_AVX512BW_capable(void) { +#if defined(__AVX512BW__) + return 1; +#else + return (OPENSSL_get_ia32cap(2) & (1u << 30)) != 0; +#endif +} + +OPENSSL_INLINE int CRYPTO_is_AVX512VL_capable(void) { +#if defined(__AVX512VL__) + return 1; +#else + return (OPENSSL_get_ia32cap(2) & (1u << 31)) != 0; +#endif +} + +// CRYPTO_cpu_avoid_zmm_registers returns 1 if zmm registers (512-bit vectors) +// should not be used even if the CPU supports them. +// +// Note that this reuses the bit for the removed MPX feature. +OPENSSL_INLINE int CRYPTO_cpu_avoid_zmm_registers(void) { + return (OPENSSL_get_ia32cap(2) & (1u << 14)) != 0; +} + +OPENSSL_INLINE int CRYPTO_is_VAES_capable(void) { +#if defined(__VAES__) + return 1; +#else + return (OPENSSL_get_ia32cap(3) & (1u << 9)) != 0; +#endif +} + +OPENSSL_INLINE int CRYPTO_is_VPCLMULQDQ_capable(void) { +#if defined(__VPCLMULQDQ__) + return 1; +#else + return (OPENSSL_get_ia32cap(3) & (1u << 10)) != 0; +#endif +} + #endif // OPENSSL_X86 || OPENSSL_X86_64 #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) @@ -1535,9 +1615,9 @@ OPENSSL_INLINE int CRYPTO_cpu_perf_is_like_silvermont(void) { extern uint32_t OPENSSL_armcap_P; // OPENSSL_get_armcap initializes the library if needed and returns ARM CPU -// capabilities. It is marked as a pure function so duplicate calls can be -// merged by the compiler, at least when indices match. -OPENSSL_ATTR_PURE uint32_t OPENSSL_get_armcap(void); +// capabilities. It is marked as a const function so duplicate calls can be +// merged by the compiler. +OPENSSL_ATTR_CONST uint32_t OPENSSL_get_armcap(void); // We do not detect any features at runtime on several 32-bit Arm platforms. // Apple platforms and OpenBSD require NEON and moved to 64-bit to pick up Armv8 diff --git a/Sources/CCryptoBoringSSL/crypto/kyber/kyber.c b/Sources/CCryptoBoringSSL/crypto/kyber/kyber.c index c84c4ed8..bd860118 100644 --- a/Sources/CCryptoBoringSSL/crypto/kyber/kyber.c +++ b/Sources/CCryptoBoringSSL/crypto/kyber/kyber.c @@ -12,6 +12,7 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ +#define OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER #include #include diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md4/md4.c b/Sources/CCryptoBoringSSL/crypto/md4/md4.c similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md4/md4.c rename to Sources/CCryptoBoringSSL/crypto/md4/md4.c index 808c4667..2782e991 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md4/md4.c +++ b/Sources/CCryptoBoringSSL/crypto/md4/md4.c @@ -59,8 +59,8 @@ #include #include -#include "../../internal.h" -#include "../digest/md32_common.h" +#include "../internal.h" +#include "../crypto/fipsmodule/digest/md32_common.h" uint8_t *MD4(const uint8_t *data, size_t len, uint8_t out[MD4_DIGEST_LENGTH]) { @@ -231,10 +231,3 @@ void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num) { D = state[3] += D; } } - -#undef F -#undef G -#undef H -#undef R0 -#undef R1 -#undef R2 diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/internal.h b/Sources/CCryptoBoringSSL/crypto/md5/internal.h similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/internal.h rename to Sources/CCryptoBoringSSL/crypto/md5/internal.h diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/md5.c b/Sources/CCryptoBoringSSL/crypto/md5/md5.c similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/md5.c rename to Sources/CCryptoBoringSSL/crypto/md5/md5.c index 5394eb8f..fccbd6eb 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5/md5.c +++ b/Sources/CCryptoBoringSSL/crypto/md5/md5.c @@ -60,8 +60,8 @@ #include -#include "../../internal.h" -#include "../digest/md32_common.h" +#include "../internal.h" +#include "../fipsmodule/digest/md32_common.h" #include "internal.h" diff --git a/Sources/CCryptoBoringSSL/crypto/mem.c b/Sources/CCryptoBoringSSL/crypto/mem.c index 5b6298c1..446b9d2d 100644 --- a/Sources/CCryptoBoringSSL/crypto/mem.c +++ b/Sources/CCryptoBoringSSL/crypto/mem.c @@ -94,7 +94,11 @@ static void __asan_unpoison_memory_region(const void *addr, size_t size) {} // Windows doesn't really support weak symbols as of May 2019, and Clang on // Windows will emit strong symbols instead. See // https://bugs.llvm.org/show_bug.cgi?id=37598 -#if defined(__ELF__) && defined(__GNUC__) +// +// EDK2 targets UEFI but builds as ELF and then translates the binary to +// COFF(!). Thus it builds with __ELF__ defined but cannot actually cope with +// weak symbols. +#if !defined(__EDK2_BORINGSSL__) && defined(__ELF__) && defined(__GNUC__) #define WEAK_SYMBOL_FUNC(rettype, name, args) \ rettype name args __attribute__((weak)); #else @@ -242,7 +246,7 @@ void *OPENSSL_malloc(size_t size) { __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); return ((uint8_t *)ptr) + OPENSSL_MALLOC_PREFIX; - err: +err: // This only works because ERR does not call OPENSSL_malloc. OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); return NULL; @@ -398,13 +402,8 @@ char *OPENSSL_strdup(const char *s) { if (s == NULL) { return NULL; } - const size_t len = strlen(s) + 1; - char *ret = OPENSSL_malloc(len); - if (ret == NULL) { - return NULL; - } - OPENSSL_memcpy(ret, s, len); - return ret; + // Copy the NUL terminator. 
+ return OPENSSL_memdup(s, strlen(s) + 1); } int OPENSSL_isalpha(int c) { @@ -528,7 +527,7 @@ int OPENSSL_vasprintf_internal(char **str, const char *format, va_list args, *str = candidate; return ret; - err: +err: deallocate(candidate); *str = NULL; errno = ENOMEM; diff --git a/Sources/CCryptoBoringSSL/crypto/mldsa/internal.h b/Sources/CCryptoBoringSSL/crypto/mldsa/internal.h new file mode 100644 index 00000000..dca55274 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/mldsa/internal.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2024, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H + +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + + +// MLDSA_SIGNATURE_RANDOMIZER_BYTES is the number of bytes of uniformly +// random entropy necessary to generate a signature in randomized mode. +#define MLDSA_SIGNATURE_RANDOMIZER_BYTES 32 + +// MLDSA65_generate_key_external_entropy generates a public/private key pair +// using the given seed, writes the encoded public key to +// |out_encoded_public_key| and sets |out_private_key| to the private key. +// It returns 1 on success and 0 on failure. +OPENSSL_EXPORT int MLDSA65_generate_key_external_entropy( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + struct MLDSA65_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]); + +// MLDSA65_sign_internal signs |msg| using |private_key| and writes the +// signature to |out_encoded_signature|. The |context_prefix| and |context| are +// prefixed to the message, in that order, before signing. The |randomizer| +// value can be set to zero bytes in order to make a deterministic signature, or +// else filled with entropy for the usual |MLDSA_sign| behavior. It returns 1 on +// success and 0 on error. +OPENSSL_EXPORT int MLDSA65_sign_internal( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]); + +// MLDSA65_verify_internal verifies that |encoded_signature| is a valid +// signature of |msg| by |public_key|. The |context_prefix| and |context| are +// prefixed to the message before verification, in that order. It returns 1 on +// success and 0 on error. 
+OPENSSL_EXPORT int MLDSA65_verify_internal( + const struct MLDSA65_public_key *public_key, + const uint8_t encoded_signature[MLDSA65_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix, + size_t context_prefix_len, const uint8_t *context, size_t context_len); + +// MLDSA65_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-DSA-65 private keys. It returns 1 on success or 0 +// on allocation error. +OPENSSL_EXPORT int MLDSA65_marshal_private_key( + CBB *out, const struct MLDSA65_private_key *private_key); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_CRYPTO_MLDSA_INTERNAL_H diff --git a/Sources/CCryptoBoringSSL/crypto/mldsa/mldsa.c b/Sources/CCryptoBoringSSL/crypto/mldsa/mldsa.c new file mode 100644 index 00000000..8454debd --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/mldsa/mldsa.c @@ -0,0 +1,1687 @@ +/* Copyright (c) 2024, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include + +#include +#include + +#include +#include +#include + +#include "../internal.h" +#include "../keccak/internal.h" +#include "./internal.h" + +#define DEGREE 256 +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define OMEGA 55 + +#define RHO_BYTES 32 +#define SIGMA_BYTES 64 +#define K_BYTES 32 +#define TR_BYTES 64 +#define MU_BYTES 64 +#define RHO_PRIME_BYTES 64 +#define LAMBDA_BITS 192 +#define LAMBDA_BYTES (LAMBDA_BITS / 8) + +// 2^23 - 2^13 + 1 +static const uint32_t kPrime = 8380417; +// Inverse of -kPrime modulo 2^32 +static const uint32_t kPrimeNegInverse = 4236238847; +static const int kDroppedBits = 13; +static const uint32_t kHalfPrime = (8380417 - 1) / 2; +static const uint32_t kGamma1 = 1 << 19; +static const uint32_t kGamma2 = (8380417 - 1) / 32; +// 256^-1 mod kPrime, in Montgomery form. 
+static const uint32_t kInverseDegreeMontgomery = 41978; + +typedef struct scalar { + uint32_t c[DEGREE]; +} scalar; + +typedef struct vectork { + scalar v[K]; +} vectork; + +typedef struct vectorl { + scalar v[L]; +} vectorl; + +typedef struct matrix { + scalar v[K][L]; +} matrix; + +/* Arithmetic */ + +// This bit of Python will be referenced in some of the following comments: +// +// q = 8380417 +// # Inverse of -q modulo 2^32 +// q_neg_inverse = 4236238847 +// # 2^64 modulo q +// montgomery_square = 2365951 +// +// def bitreverse(i): +// ret = 0 +// for n in range(8): +// bit = i & 1 +// ret <<= 1 +// ret |= bit +// i >>= 1 +// return ret +// +// def montgomery_reduce(x): +// a = (x * q_neg_inverse) % 2**32 +// b = x + a * q +// assert b & 0xFFFF_FFFF == 0 +// c = b >> 32 +// assert c < q +// return c +// +// def montgomery_transform(x): +// return montgomery_reduce(x * montgomery_square) + +// kNTTRootsMontgomery = [ +// montgomery_transform(pow(1753, bitreverse(i), q)) for i in range(256) +// ] +static const uint32_t kNTTRootsMontgomery[256] = { + 4193792, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, + 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, + 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, + 6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, + 4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, + 6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, + 811944, 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, + 4450022, 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, + 7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, + 3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, + 7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, + 5037034, 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, + 4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969, 7063561, + 189548, 4827145, 3159746, 6529015, 5971092, 8202977, 1315589, 1341330, + 1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961, + 2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955, + 266997, 2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039, + 900702, 1859098, 909542, 819034, 495491, 6767243, 8337157, 7857917, + 7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579, + 342297, 286988, 5942594, 4108315, 3437287, 5038140, 1735879, 203044, + 2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974, + 4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447, + 7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775, + 7100756, 1917081, 5834105, 7005614, 1500165, 777191, 2235880, 3406031, + 7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136, 4603424, + 6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531, 7173032, + 5196991, 162844, 1616392, 3014001, 810149, 1652634, 4686184, 6581310, + 5341501, 3523897, 3866901, 269760, 2213111, 7404533, 1717735, 472078, + 7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524, + 5441381, 6144432, 7959518, 6094090, 183443, 7403526, 1612842, 4834730, + 7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782}; + +// Reduces x mod kPrime in constant time, where 0 <= x < 2*kPrime. 
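Before the constant-time helpers that follow, the reduction described by the Python comment above can be restated as a plain, variable-time C reference. This is an illustrative sketch only (the helper name is made up); it mirrors |reduce_montgomery| further down and uses the same constants.

#include <assert.h>
#include <stdint.h>

#define Q 8380417u             // kPrime
#define Q_NEG_INV 4236238847u  // -Q^-1 mod 2^32

// Returns x * 2^-32 mod Q for 0 <= x <= Q * 2^32.
static uint32_t montgomery_reduce_ref(uint64_t x) {
  uint32_t a = (uint32_t)x * Q_NEG_INV;  // a = x * (-Q^-1) mod 2^32
  uint64_t b = x + (uint64_t)a * Q;      // the low 32 bits of b are now zero
  assert((b & 0xffffffffu) == 0);
  uint32_t c = (uint32_t)(b >> 32);      // 0 <= c < 2*Q
  return c < Q ? c : c - Q;              // one conditional subtraction
}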
+static uint32_t reduce_once(uint32_t x) { + declassify_assert(x < 2 * kPrime); + // return x < kPrime ? x : x - kPrime; + return constant_time_select_int(constant_time_lt_w(x, kPrime), x, x - kPrime); +} + +// Returns the absolute value in constant time. +static uint32_t abs_signed(uint32_t x) { + // return is_positive(x) ? x : -x; + // Note: MSVC doesn't like applying the unary minus operator to unsigned types + // (warning C4146), so we write the negation as a bitwise not plus one + // (assuming two's complement representation). + return constant_time_select_int(constant_time_lt_w(x, 0x80000000), x, 0u - x); +} + +// Returns the absolute value modulo kPrime. +static uint32_t abs_mod_prime(uint32_t x) { + declassify_assert(x < kPrime); + // return x > kHalfPrime ? kPrime - x : x; + return constant_time_select_int(constant_time_lt_w(kHalfPrime, x), kPrime - x, + x); +} + +// Returns the maximum of two values in constant time. +static uint32_t maximum(uint32_t x, uint32_t y) { + // return x < y ? y : x; + return constant_time_select_int(constant_time_lt_w(x, y), y, x); +} + +static uint32_t mod_sub(uint32_t a, uint32_t b) { + declassify_assert(a < kPrime); + declassify_assert(b < kPrime); + return reduce_once(kPrime + a - b); +} + +static void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_once(lhs->c[i] + rhs->c[i]); + } +} + +static void scalar_sub(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = mod_sub(lhs->c[i], rhs->c[i]); + } +} + +static uint32_t reduce_montgomery(uint64_t x) { + declassify_assert(x <= ((uint64_t)kPrime << 32)); + uint64_t a = (uint32_t)x * kPrimeNegInverse; + uint64_t b = x + a * kPrime; + declassify_assert((b & 0xffffffff) == 0); + uint32_t c = b >> 32; + return reduce_once(c); +} + +// Multiply two scalars in the number theoretically transformed state. +static void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = reduce_montgomery((uint64_t)lhs->c[i] * (uint64_t)rhs->c[i]); + } +} + +// In place number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 41 (`NTT`). +static void scalar_ntt(scalar *s) { + // Step: 1, 2, 4, 8, ..., 128 + // Offset: 128, 64, 32, 16, ..., 1 + int offset = DEGREE; + for (int step = 1; step < DEGREE; step <<= 1) { + offset >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = kNTTRootsMontgomery[step + i]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime. + // |step_root| < kPrime because it's static data. |s->c[...]| is < + // kPrime by the invariants of that struct. + uint32_t odd = + reduce_montgomery((uint64_t)step_root * (uint64_t)s->c[j + offset]); + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = mod_sub(even, odd); + } + k += 2 * offset; + } + } +} + +// In place inverse number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 42 (`NTT^-1`). 
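The forward transform above and the inverse transform that follows exist to turn multiplication in Z_q[X]/(X^256 + 1) into coefficient-wise multiplication. Note that |kInverseDegreeMontgomery| is 256^-1 * 2^64 mod q, so the final scaling of the inverse transform also absorbs the 2^-32 factor that |scalar_mult| leaves behind. A rough composition sketch (hypothetical helper, reusing the static functions from this file):

static void scalar_inverse_ntt(scalar *s);  // defined just below

// Multiplies two polynomials by transforming, multiplying pointwise and
// transforming back; the Montgomery factor from scalar_mult is canceled by
// the inverse transform's final scaling, so |out| is exactly a*b in the ring.
static void poly_mult_sketch(scalar *out, const scalar *a, const scalar *b) {
  scalar a_ntt = *a, b_ntt = *b;
  scalar_ntt(&a_ntt);
  scalar_ntt(&b_ntt);
  scalar_mult(out, &a_ntt, &b_ntt);
  scalar_inverse_ntt(out);
}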
+static void scalar_inverse_ntt(scalar *s) { + // Step: 128, 64, 32, 16, ..., 1 + // Offset: 1, 2, 4, 8, ..., 128 + int step = DEGREE; + for (int offset = 1; offset < DEGREE; offset <<= 1) { + step >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = + kPrime - kNTTRootsMontgomery[step + (step - 1 - i)]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + uint32_t odd = s->c[j + offset]; + s->c[j] = reduce_once(odd + even); + + // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime. + // kPrime + even < 2*kPrime because |even| < kPrime, by the invariants + // of that structure. Thus kPrime + even - odd < 2*kPrime because odd >= + // 0, because it's unsigned and less than kPrime. Lastly step_root < + // kPrime, because |kNTTRootsMontgomery| is static data. + s->c[j + offset] = reduce_montgomery((uint64_t)step_root * + (uint64_t)(kPrime + even - odd)); + } + k += 2 * offset; + } + } + for (int i = 0; i < DEGREE; i++) { + s->c[i] = reduce_montgomery((uint64_t)s->c[i] * + (uint64_t)kInverseDegreeMontgomery); + } +} + +static void vectork_zero(vectork *out) { OPENSSL_memset(out, 0, sizeof(*out)); } + +static void vectork_add(vectork *out, const vectork *lhs, const vectork *rhs) { + for (int i = 0; i < K; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectork_sub(vectork *out, const vectork *lhs, const vectork *rhs) { + for (int i = 0; i < K; i++) { + scalar_sub(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectork_mult_scalar(vectork *out, const vectork *lhs, + const scalar *rhs) { + for (int i = 0; i < K; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +static void vectork_ntt(vectork *a) { + for (int i = 0; i < K; i++) { + scalar_ntt(&a->v[i]); + } +} + +static void vectork_inverse_ntt(vectork *a) { + for (int i = 0; i < K; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void vectorl_add(vectorl *out, const vectorl *lhs, const vectorl *rhs) { + for (int i = 0; i < L; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +static void vectorl_mult_scalar(vectorl *out, const vectorl *lhs, + const scalar *rhs) { + for (int i = 0; i < L; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +static void vectorl_ntt(vectorl *a) { + for (int i = 0; i < L; i++) { + scalar_ntt(&a->v[i]); + } +} + +static void vectorl_inverse_ntt(vectorl *a) { + for (int i = 0; i < L; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void matrix_mult(vectork *out, const matrix *m, const vectorl *a) { + vectork_zero(out); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + scalar product; + scalar_mult(&product, &m->v[i][j], &a->v[j]); + scalar_add(&out->v[i], &out->v[i], &product); + } + } +} + +/* Rounding & hints */ + +// FIPS 204, Algorithm 35 (`Power2Round`). +static void power2_round(uint32_t *r1, uint32_t *r0, uint32_t r) { + *r1 = r >> kDroppedBits; + *r0 = r - (*r1 << kDroppedBits); + + uint32_t r0_adjusted = mod_sub(*r0, 1 << kDroppedBits); + uint32_t r1_adjusted = *r1 + 1; + + // Mask is set iff r0 > 2^(dropped_bits - 1). + crypto_word_t mask = + constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0); + // r0 = mask ? r0_adjusted : r0 + *r0 = constant_time_select_int(mask, r0_adjusted, *r0); + // r1 = mask ? r1_adjusted : r1 + *r1 = constant_time_select_int(mask, r1_adjusted, *r1); +} + +// Scale back previously rounded value. 
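|power2_round| above splits each coefficient r into a high part r1 and a low part r0 (stored mod q, representing a value in (-2^12, 2^12]) with r = r1 * 2^13 + r0 mod q; the scaling helper that follows is its partial inverse, applied to the public t1 during verification. A small recombination check, as an illustrative sketch using the helpers already defined:

// For any r in [0, kPrime), recombining the two halves gives back r.
// Example: r = 5000 yields r1 = 1 and r0 = kPrime - 3192 (i.e. -3192),
// and 1 * 8192 - 3192 == 5000.
static int power2_round_roundtrips(uint32_t r) {
  uint32_t r1, r0;
  power2_round(&r1, &r0, r);
  // r1 <= 1023, so (r1 << 13) + r0 < 2 * kPrime and reduce_once applies.
  return reduce_once((r1 << kDroppedBits) + r0) == r;
}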
+static void scale_power2_round(uint32_t *out, uint32_t r1) { + // Pre-condition: 0 <= r1 <= 2^10 - 1 + assert(r1 < (1u << 10)); + + *out = r1 << kDroppedBits; + + // Post-condition: 0 <= out <= 2^23 - 2^13 = kPrime - 1 + assert(*out < kPrime); +} + +// FIPS 204, Algorithm 37 (`HighBits`). +static uint32_t high_bits(uint32_t x) { + // Reference description (given 0 <= x < q): + // + // ``` + // int32_t r0 = x mod+- (2 * kGamma2); + // if (x - r0 == q - 1) { + // return 0; + // } else { + // return (x - r0) / (2 * kGamma2); + // } + // ``` + // + // Below is the formula taken from the reference implementation. + // + // Here, kGamma2 == 2^18 - 2^8 + // This returns ((ceil(x / 2^7) * (2^10 + 1) + 2^21) / 2^22) mod 2^4 + uint32_t r1 = (x + 127) >> 7; + r1 = (r1 * 1025 + (1 << 21)) >> 22; + r1 &= 15; + return r1; +} + +// FIPS 204, Algorithm 36 (`Decompose`). +static void decompose(uint32_t *r1, int32_t *r0, uint32_t r) { + *r1 = high_bits(r); + + *r0 = r; + *r0 -= *r1 * 2 * (int32_t)kGamma2; + *r0 -= (((int32_t)kHalfPrime - *r0) >> 31) & (int32_t)kPrime; +} + +// FIPS 204, Algorithm 38 (`LowBits`). +static int32_t low_bits(uint32_t x) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, x); + return r0; +} + +// FIPS 204, Algorithm 39 (`MakeHint`). +// +// In the spec this takes two arguments, z and r, and is called with +// z = -ct0 +// r = w - cs2 + ct0 +// +// It then computes HighBits (algorithm 37) of z and z+r. But z+r is just w - +// cs2, so this takes three arguments and saves an addition. +static int32_t make_hint(uint32_t ct0, uint32_t cs2, uint32_t w) { + uint32_t r_plus_z = mod_sub(w, cs2); + uint32_t r = reduce_once(r_plus_z + ct0); + return high_bits(r) != high_bits(r_plus_z); +} + +// FIPS 204, Algorithm 40 (`UseHint`). +static uint32_t use_hint_vartime(uint32_t h, uint32_t r) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, r); + + if (h) { + if (r0 > 0) { + // m = 16, thus |mod m| in the spec turns into |& 15|. 
+ return (r1 + 1) & 15; + } else { + return (r1 - 1) & 15; + } + } + return r1; +} + +static void scalar_power2_round(scalar *s1, scalar *s0, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + power2_round(&s1->c[i], &s0->c[i], s->c[i]); + } +} + +static void scalar_scale_power2_round(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + scale_power2_round(&out->c[i], in->c[i]); + } +} + +static void scalar_high_bits(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = high_bits(in->c[i]); + } +} + +static void scalar_low_bits(scalar *out, const scalar *in) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = low_bits(in->c[i]); + } +} + +static void scalar_max(uint32_t *max, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + uint32_t abs = abs_mod_prime(s->c[i]); + *max = maximum(*max, abs); + } +} + +static void scalar_max_signed(uint32_t *max, const scalar *s) { + for (int i = 0; i < DEGREE; i++) { + uint32_t abs = abs_signed(s->c[i]); + *max = maximum(*max, abs); + } +} + +static void scalar_make_hint(scalar *out, const scalar *ct0, const scalar *cs2, + const scalar *w) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = make_hint(ct0->c[i], cs2->c[i], w->c[i]); + } +} + +static void scalar_use_hint_vartime(scalar *out, const scalar *h, + const scalar *r) { + for (int i = 0; i < DEGREE; i++) { + out->c[i] = use_hint_vartime(h->c[i], r->c[i]); + } +} + +static void vectork_power2_round(vectork *t1, vectork *t0, const vectork *t) { + for (int i = 0; i < K; i++) { + scalar_power2_round(&t1->v[i], &t0->v[i], &t->v[i]); + } +} + +static void vectork_scale_power2_round(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_scale_power2_round(&out->v[i], &in->v[i]); + } +} + +static void vectork_high_bits(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_high_bits(&out->v[i], &in->v[i]); + } +} + +static void vectork_low_bits(vectork *out, const vectork *in) { + for (int i = 0; i < K; i++) { + scalar_low_bits(&out->v[i], &in->v[i]); + } +} + +static uint32_t vectork_max(const vectork *a) { + uint32_t max = 0; + for (int i = 0; i < K; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +static uint32_t vectork_max_signed(const vectork *a) { + uint32_t max = 0; + for (int i = 0; i < K; i++) { + scalar_max_signed(&max, &a->v[i]); + } + return max; +} + +// The input vector contains only zeroes and ones. +static size_t vectork_count_ones(const vectork *a) { + size_t count = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < DEGREE; j++) { + count += a->v[i].c[j]; + } + } + return count; +} + +static void vectork_make_hint(vectork *out, const vectork *ct0, + const vectork *cs2, const vectork *w) { + for (int i = 0; i < K; i++) { + scalar_make_hint(&out->v[i], &ct0->v[i], &cs2->v[i], &w->v[i]); + } +} + +static void vectork_use_hint_vartime(vectork *out, const vectork *h, + const vectork *r) { + for (int i = 0; i < K; i++) { + scalar_use_hint_vartime(&out->v[i], &h->v[i], &r->v[i]); + } +} + +static uint32_t vectorl_max(const vectorl *a) { + uint32_t max = 0; + for (int i = 0; i < L; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +/* Bit packing */ + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 4. +static void scalar_encode_4(uint8_t out[128], const scalar *s) { + // Every two elements lands on a byte boundary. 
+ static_assert(DEGREE % 2 == 0, "DEGREE must be a multiple of 2"); + for (int i = 0; i < DEGREE / 2; i++) { + uint32_t a = s->c[2 * i]; + uint32_t b = s->c[2 * i + 1]; + declassify_assert(a < 16); + declassify_assert(b < 16); + out[i] = a | (b << 4); + } +} + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 10. +static void scalar_encode_10(uint8_t out[320], const scalar *s) { + // Every four elements lands on a byte boundary. + static_assert(DEGREE % 4 == 0, "DEGREE must be a multiple of 4"); + for (int i = 0; i < DEGREE / 4; i++) { + uint32_t a = s->c[4 * i]; + uint32_t b = s->c[4 * i + 1]; + uint32_t c = s->c[4 * i + 2]; + uint32_t d = s->c[4 * i + 3]; + declassify_assert(a < 1024); + declassify_assert(b < 1024); + declassify_assert(c < 1024); + declassify_assert(d < 1024); + out[5 * i] = (uint8_t)a; + out[5 * i + 1] = (uint8_t)((a >> 8) | (b << 2)); + out[5 * i + 2] = (uint8_t)((b >> 6) | (c << 4)); + out[5 * i + 3] = (uint8_t)((c >> 4) | (d << 6)); + out[5 * i + 4] = (uint8_t)(d >> 2); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 4 and b = +// 2^19. +static void scalar_encode_signed_4_eta(uint8_t out[128], const scalar *s) { + // Every two elements lands on a byte boundary. + static_assert(DEGREE % 2 == 0, "DEGREE must be a multiple of 2"); + for (int i = 0; i < DEGREE / 2; i++) { + uint32_t a = mod_sub(ETA, s->c[2 * i]); + uint32_t b = mod_sub(ETA, s->c[2 * i + 1]); + declassify_assert(a < 16); + declassify_assert(b < 16); + out[i] = a | (b << 4); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 13 and b = +// 2^12. +static void scalar_encode_signed_13_12(uint8_t out[416], const scalar *s) { + static const uint32_t kMax = 1u << 12; + // Every two elements lands on a byte boundary. + static_assert(DEGREE % 8 == 0, "DEGREE must be a multiple of 8"); + for (int i = 0; i < DEGREE / 8; i++) { + uint32_t a = mod_sub(kMax, s->c[8 * i]); + uint32_t b = mod_sub(kMax, s->c[8 * i + 1]); + uint32_t c = mod_sub(kMax, s->c[8 * i + 2]); + uint32_t d = mod_sub(kMax, s->c[8 * i + 3]); + uint32_t e = mod_sub(kMax, s->c[8 * i + 4]); + uint32_t f = mod_sub(kMax, s->c[8 * i + 5]); + uint32_t g = mod_sub(kMax, s->c[8 * i + 6]); + uint32_t h = mod_sub(kMax, s->c[8 * i + 7]); + declassify_assert(a < (1u << 13)); + declassify_assert(b < (1u << 13)); + declassify_assert(c < (1u << 13)); + declassify_assert(d < (1u << 13)); + declassify_assert(e < (1u << 13)); + declassify_assert(f < (1u << 13)); + declassify_assert(g < (1u << 13)); + declassify_assert(h < (1u << 13)); + a |= b << 13; + a |= c << 26; + c >>= 6; + c |= d << 7; + c |= e << 20; + e >>= 12; + e |= f << 1; + e |= g << 14; + e |= h << 27; + h >>= 5; + OPENSSL_memcpy(&out[13 * i], &a, sizeof(a)); + OPENSSL_memcpy(&out[13 * i + 4], &c, sizeof(c)); + OPENSSL_memcpy(&out[13 * i + 8], &e, sizeof(e)); + OPENSSL_memcpy(&out[13 * i + 12], &h, 1); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(b) = 20 and b = +// 2^19. +static void scalar_encode_signed_20_19(uint8_t out[640], const scalar *s) { + static const uint32_t kMax = 1u << 19; + // Every two elements lands on a byte boundary. 
+ static_assert(DEGREE % 4 == 0, "DEGREE must be a multiple of 4"); + for (int i = 0; i < DEGREE / 4; i++) { + uint32_t a = mod_sub(kMax, s->c[4 * i]); + uint32_t b = mod_sub(kMax, s->c[4 * i + 1]); + uint32_t c = mod_sub(kMax, s->c[4 * i + 2]); + uint32_t d = mod_sub(kMax, s->c[4 * i + 3]); + declassify_assert(a < (1u << 20)); + declassify_assert(b < (1u << 20)); + declassify_assert(c < (1u << 20)); + declassify_assert(d < (1u << 20)); + a |= b << 20; + b >>= 12; + b |= c << 8; + b |= d << 28; + d >>= 4; + OPENSSL_memcpy(&out[10 * i], &a, sizeof(a)); + OPENSSL_memcpy(&out[10 * i + 4], &b, sizeof(b)); + OPENSSL_memcpy(&out[10 * i + 8], &d, 2); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). +static void scalar_encode_signed(uint8_t *out, const scalar *s, int bits, + uint32_t max) { + if (bits == 4) { + assert(max == ETA); + scalar_encode_signed_4_eta(out, s); + } else if (bits == 20) { + assert(max == 1u << 19); + scalar_encode_signed_20_19(out, s); + } else { + assert(bits == 13); + assert(max == 1u << 12); + scalar_encode_signed_13_12(out, s); + } +} + +// FIPS 204, Algorithm 18 (`SimpleBitUnpack`). Specialized for bitlen(b) == 10. +static void scalar_decode_10(scalar *out, const uint8_t in[320]) { + uint32_t v; + static_assert(DEGREE % 4 == 0, "DEGREE must be a multiple of 4"); + for (int i = 0; i < DEGREE / 4; i++) { + OPENSSL_memcpy(&v, &in[5 * i], sizeof(v)); + out->c[4 * i] = v & 0x3ff; + out->c[4 * i + 1] = (v >> 10) & 0x3ff; + out->c[4 * i + 2] = (v >> 20) & 0x3ff; + out->c[4 * i + 3] = (v >> 30) | (((uint32_t)in[5 * i + 4]) << 2); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 4 and b = +// eta. +static int scalar_decode_signed_4_eta(scalar *out, const uint8_t in[128]) { + uint32_t v; + static_assert(DEGREE % 8 == 0, "DEGREE must be a multiple of 8"); + for (int i = 0; i < DEGREE / 8; i++) { + OPENSSL_memcpy(&v, &in[4 * i], sizeof(v)); + static_assert(ETA == 4, "ETA must be 4"); + // None of the nibbles may be >= 9. So if the MSB of any nibble is set, none + // of the other bits may be set. First, select all the MSBs. + const uint32_t msbs = v & 0x88888888u; + // For each nibble where the MSB is set, form a mask of all the other bits. + const uint32_t mask = (msbs >> 1) | (msbs >> 2) | (msbs >> 3); + // A nibble is only out of range in the case of invalid input, in which case + // it is okay to leak the value. + if (constant_time_declassify_int((mask & v) != 0)) { + return 0; + } + + out->c[i * 8] = mod_sub(ETA, v & 15); + out->c[i * 8 + 1] = mod_sub(ETA, (v >> 4) & 15); + out->c[i * 8 + 2] = mod_sub(ETA, (v >> 8) & 15); + out->c[i * 8 + 3] = mod_sub(ETA, (v >> 12) & 15); + out->c[i * 8 + 4] = mod_sub(ETA, (v >> 16) & 15); + out->c[i * 8 + 5] = mod_sub(ETA, (v >> 20) & 15); + out->c[i * 8 + 6] = mod_sub(ETA, (v >> 24) & 15); + out->c[i * 8 + 7] = mod_sub(ETA, v >> 28); + } + return 1; +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 13 and b = +// 2^12. 
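BitPack stores each coefficient x as b - x, so unsigned decoding just computes b - raw. With b = 2^12 and 13-bit fields (and likewise b = 2^19 with 20-bit fields below) every bit pattern decodes into the valid range, which is why these decoders, unlike the eta = 4 one above, have no failure case. A one-coefficient round trip, sketched with the helpers already defined:

static void bitpack_13_12_example(void) {
  uint32_t x = kPrime - 100;                      // represents -100 mod kPrime
  uint32_t packed = mod_sub(1u << 12, x);         // BitPack: b - x == 4196, fits in 13 bits
  uint32_t unpacked = mod_sub(1u << 12, packed);  // BitUnpack: b - packed
  assert(unpacked == x);                          // round-trips exactly
}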
+static void scalar_decode_signed_13_12(scalar *out, const uint8_t in[416]) { + static const uint32_t kMax = 1u << 12; + static const uint32_t k13Bits = (1u << 13) - 1; + static const uint32_t k7Bits = (1u << 7) - 1; + + uint32_t a, b, c; + uint8_t d; + static_assert(DEGREE % 8 == 0, "DEGREE must be a multiple of 8"); + for (int i = 0; i < DEGREE / 8; i++) { + OPENSSL_memcpy(&a, &in[13 * i], sizeof(a)); + OPENSSL_memcpy(&b, &in[13 * i + 4], sizeof(b)); + OPENSSL_memcpy(&c, &in[13 * i + 8], sizeof(c)); + d = in[13 * i + 12]; + + // It's not possible for a 13-bit number to be out of range when the max is + // 2^12. + out->c[i * 8] = mod_sub(kMax, a & k13Bits); + out->c[i * 8 + 1] = mod_sub(kMax, (a >> 13) & k13Bits); + out->c[i * 8 + 2] = mod_sub(kMax, (a >> 26) | ((b & k7Bits) << 6)); + out->c[i * 8 + 3] = mod_sub(kMax, (b >> 7) & k13Bits); + out->c[i * 8 + 4] = mod_sub(kMax, (b >> 20) | ((c & 1) << 12)); + out->c[i * 8 + 5] = mod_sub(kMax, (c >> 1) & k13Bits); + out->c[i * 8 + 6] = mod_sub(kMax, (c >> 14) & k13Bits); + out->c[i * 8 + 7] = mod_sub(kMax, (c >> 27) | ((uint32_t)d) << 5); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 20 and b = +// 2^19. +static void scalar_decode_signed_20_19(scalar *out, const uint8_t in[640]) { + static const uint32_t kMax = 1u << 19; + static const uint32_t k20Bits = (1u << 20) - 1; + + uint32_t a, b; + uint16_t c; + static_assert(DEGREE % 4 == 0, "DEGREE must be a multiple of 4"); + for (int i = 0; i < DEGREE / 4; i++) { + OPENSSL_memcpy(&a, &in[10 * i], sizeof(a)); + OPENSSL_memcpy(&b, &in[10 * i + 4], sizeof(b)); + OPENSSL_memcpy(&c, &in[10 * i + 8], sizeof(c)); + + // It's not possible for a 20-bit number to be out of range when the max is + // 2^19. + out->c[i * 4] = mod_sub(kMax, a & k20Bits); + out->c[i * 4 + 1] = mod_sub(kMax, (a >> 20) | ((b & 0xff) << 12)); + out->c[i * 4 + 2] = mod_sub(kMax, (b >> 8) & k20Bits); + out->c[i * 4 + 3] = mod_sub(kMax, (b >> 28) | ((uint32_t)c) << 4); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). +static int scalar_decode_signed(scalar *out, const uint8_t *in, int bits, + uint32_t max) { + if (bits == 4) { + assert(max == ETA); + return scalar_decode_signed_4_eta(out, in); + } else if (bits == 13) { + assert(max == (1u << 12)); + scalar_decode_signed_13_12(out, in); + return 1; + } else if (bits == 20) { + assert(max == (1u << 19)); + scalar_decode_signed_20_19(out, in); + return 1; + } else { + abort(); + } +} + +/* Expansion functions */ + +// FIPS 204, Algorithm 30 (`RejNTTPoly`). +// +// Rejection samples a Keccak stream to get uniformly distributed elements. This +// is used for matrix expansion and only operates on public inputs. +static void scalar_from_keccak_vartime( + scalar *out, const uint8_t derived_seed[RHO_BYTES + 2]) { + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, RHO_BYTES + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 168); + static_assert(168 % 3 == 0, "block and coefficient boundaries do not align"); + + int done = 0; + while (done < DEGREE) { + uint8_t block[168]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; i += 3) { + // FIPS 204, Algorithm 14 (`CoeffFromThreeBytes`). 
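      // (Each three-byte group below is read little-endian with the top bit of
      // the third byte cleared, giving a 23-bit candidate; candidates >= kPrime
      // are rejected. The acceptance probability is kPrime / 2^23, roughly
      // 0.999, so one 168-byte SHAKE-128 block yields about 56 coefficients and
      // a full 256-coefficient scalar typically needs five squeezed blocks.)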
+ uint32_t value = (uint32_t)block[i] | ((uint32_t)block[i + 1] << 8) | + (((uint32_t)block[i + 2] & 0x7f) << 16); + if (value < kPrime) { + out->c[done++] = value; + } + } + } +} + +// FIPS 204, Algorithm 31 (`RejBoundedPoly`). +static void scalar_uniform_eta_4(scalar *out, + const uint8_t derived_seed[SIGMA_BYTES + 2]) { + static_assert(ETA == 4, "This implementation is specialized for ETA == 4"); + + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, SIGMA_BYTES + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + int done = 0; + while (done < DEGREE) { + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; ++i) { + uint32_t t0 = block[i] & 0x0F; + uint32_t t1 = block[i] >> 4; + // FIPS 204, Algorithm 15 (`CoefFromHalfByte`). Although both the input + // and output here are secret, it is OK to leak when we rejected a byte. + // Individual bytes of the SHAKE-256 stream are (indistiguishable from) + // independent of each other and the original seed, so leaking information + // about the rejected bytes does not reveal the input or output. + if (constant_time_declassify_int(t0 < 9)) { + out->c[done++] = mod_sub(ETA, t0); + } + if (done < DEGREE && constant_time_declassify_int(t1 < 9)) { + out->c[done++] = mod_sub(ETA, t1); + } + } + } +} + +// FIPS 204, Algorithm 34 (`ExpandMask`), but just a single step. +static void scalar_sample_mask( + scalar *out, const uint8_t derived_seed[RHO_PRIME_BYTES + 2]) { + uint8_t buf[640]; + BORINGSSL_keccak(buf, sizeof(buf), derived_seed, RHO_PRIME_BYTES + 2, + boringssl_shake256); + + scalar_decode_signed_20_19(out, buf); +} + +// FIPS 204, Algorithm 29 (`SampleInBall`). +static void scalar_sample_in_ball_vartime(scalar *out, const uint8_t *seed, + int len) { + assert(len == 2 * LAMBDA_BYTES); + + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, seed, len); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + + uint64_t signs = CRYPTO_load_u64_le(block); + int offset = 8; + // SampleInBall implements a Fisher–Yates shuffle, which unavoidably leaks + // where the zeros are by memory access pattern. Although this leak happens + // before bad signatures are rejected, this is safe. See + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/8d8f01ac_70af3f21/ + CONSTTIME_DECLASSIFY(block + offset, sizeof(block) - offset); + + OPENSSL_memset(out, 0, sizeof(*out)); + for (size_t i = DEGREE - TAU; i < DEGREE; i++) { + size_t byte; + for (;;) { + if (offset == 136) { + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + // See above. + CONSTTIME_DECLASSIFY(block, sizeof(block)); + offset = 0; + } + + byte = block[offset++]; + if (byte <= i) { + break; + } + } + + out->c[i] = out->c[byte]; + out->c[byte] = mod_sub(1, 2 * (signs & 1)); + signs >>= 1; + } +} + +// FIPS 204, Algorithm 32 (`ExpandA`). 
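ExpandA samples every matrix entry directly in the NTT domain from SHAKE-128 keyed by rho plus a two-byte index, column byte first and row byte second, and the matrix is only ever used in that domain. A small sketch of the per-entry seed layout (hypothetical helper name, mirroring the loop below):

// Builds the (RHO_BYTES + 2)-byte seed rho || IntegerToBytes(col, 1) ||
// IntegerToBytes(row, 1) fed to RejNTTPoly for entry A[row][col].
static void expand_a_entry_seed(uint8_t out[RHO_BYTES + 2],
                                const uint8_t rho[RHO_BYTES], int row,
                                int col) {
  OPENSSL_memcpy(out, rho, RHO_BYTES);
  out[RHO_BYTES] = (uint8_t)col;
  out[RHO_BYTES + 1] = (uint8_t)row;
}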
+static void matrix_expand(matrix *out, const uint8_t rho[RHO_BYTES]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + + uint8_t derived_seed[RHO_BYTES + 2]; + OPENSSL_memcpy(derived_seed, rho, RHO_BYTES); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + derived_seed[RHO_BYTES + 1] = (uint8_t)i; + derived_seed[RHO_BYTES] = (uint8_t)j; + scalar_from_keccak_vartime(&out->v[i][j], derived_seed); + } + } +} + +// FIPS 204, Algorithm 33 (`ExpandS`). +static void vector_expand_short(vectorl *s1, vectork *s2, + const uint8_t sigma[SIGMA_BYTES]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + static_assert(K + L <= 0x100, "K+L must fit in 8 bits"); + + uint8_t derived_seed[SIGMA_BYTES + 2]; + OPENSSL_memcpy(derived_seed, sigma, SIGMA_BYTES); + derived_seed[SIGMA_BYTES] = 0; + derived_seed[SIGMA_BYTES + 1] = 0; + for (int i = 0; i < L; i++) { + scalar_uniform_eta_4(&s1->v[i], derived_seed); + ++derived_seed[SIGMA_BYTES]; + } + for (int i = 0; i < K; i++) { + scalar_uniform_eta_4(&s2->v[i], derived_seed); + ++derived_seed[SIGMA_BYTES]; + } +} + +// FIPS 204, Algorithm 34 (`ExpandMask`). +static void vectorl_expand_mask(vectorl *out, + const uint8_t seed[RHO_PRIME_BYTES], + size_t kappa) { + assert(kappa + L <= 0x10000); + + uint8_t derived_seed[RHO_PRIME_BYTES + 2]; + OPENSSL_memcpy(derived_seed, seed, RHO_PRIME_BYTES); + for (int i = 0; i < L; i++) { + size_t index = kappa + i; + derived_seed[RHO_PRIME_BYTES] = index & 0xFF; + derived_seed[RHO_PRIME_BYTES + 1] = (index >> 8) & 0xFF; + scalar_sample_mask(&out->v[i], derived_seed); + } +} + +/* Encoding */ + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). +// +// Encodes an entire vector into 32*K*|bits| bytes. Note that since 256 (DEGREE) +// is divisible by 8, the individual vector entries will always fill a whole +// number of bytes, so we do not need to worry about bit packing here. +static void vectork_encode(uint8_t *out, const vectork *a, int bits) { + if (bits == 4) { + for (int i = 0; i < K; i++) { + scalar_encode_4(out + i * bits * DEGREE / 8, &a->v[i]); + } + } else { + assert(bits == 10); + for (int i = 0; i < K; i++) { + scalar_encode_10(out + i * bits * DEGREE / 8, &a->v[i]); + } + } +} + +// FIPS 204, Algorithm 18 (`SimpleBitUnpack`). +static void vectork_decode_10(vectork *out, const uint8_t *in) { + for (int i = 0; i < K; i++) { + scalar_decode_10(&out->v[i], in + i * 10 * DEGREE / 8); + } +} + +static void vectork_encode_signed(uint8_t *out, const vectork *a, int bits, + uint32_t max) { + for (int i = 0; i < K; i++) { + scalar_encode_signed(out + i * bits * DEGREE / 8, &a->v[i], bits, max); + } +} + +static int vectork_decode_signed(vectork *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < K; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * DEGREE / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 17 (`BitPack`). +// +// Encodes an entire vector into 32*L*|bits| bytes. Note that since 256 (DEGREE) +// is divisible by 8, the individual vector entries will always fill a whole +// number of bytes, so we do not need to worry about bit packing here. 
+static void vectorl_encode_signed(uint8_t *out, const vectorl *a, int bits, + uint32_t max) { + for (int i = 0; i < L; i++) { + scalar_encode_signed(out + i * bits * DEGREE / 8, &a->v[i], bits, max); + } +} + +static int vectorl_decode_signed(vectorl *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < L; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * DEGREE / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 28 (`w1Encode`). +static void w1_encode(uint8_t out[128 * K], const vectork *w1) { + vectork_encode(out, w1, 4); +} + +// FIPS 204, Algorithm 20 (`HintBitPack`). +static void hint_bit_pack(uint8_t out[OMEGA + K], const vectork *h) { + OPENSSL_memset(out, 0, OMEGA + K); + int index = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < DEGREE; j++) { + if (h->v[i].c[j]) { + // h must have at most OMEGA non-zero coefficients. + BSSL_CHECK(index < OMEGA); + out[index++] = j; + } + } + out[OMEGA + i] = index; + } +} + +// FIPS 204, Algorithm 21 (`HintBitUnpack`). +static int hint_bit_unpack(vectork *h, const uint8_t in[OMEGA + K]) { + vectork_zero(h); + int index = 0; + for (int i = 0; i < K; i++) { + const int limit = in[OMEGA + i]; + if (limit < index || limit > OMEGA) { + return 0; + } + + int last = -1; + while (index < limit) { + int byte = in[index++]; + if (last >= 0 && byte <= last) { + return 0; + } + last = byte; + static_assert(DEGREE == 256, + "DEGREE must be 256 for this write to be in bounds"); + h->v[i].c[byte] = 1; + } + } + for (; index < OMEGA; index++) { + if (in[index] != 0) { + return 0; + } + } + return 1; +} + +struct public_key { + uint8_t rho[RHO_BYTES]; + vectork t1; + // Pre-cached value(s). + uint8_t public_key_hash[TR_BYTES]; +}; + +struct private_key { + uint8_t rho[RHO_BYTES]; + uint8_t k[K_BYTES]; + uint8_t public_key_hash[TR_BYTES]; + vectorl s1; + vectork s2; + vectork t0; +}; + +struct signature { + uint8_t c_tilde[2 * LAMBDA_BYTES]; + vectorl z; + vectork h; +}; + +// FIPS 204, Algorithm 22 (`pkEncode`). +static int mldsa_marshal_public_key(CBB *out, const struct public_key *pub) { + if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho))) { + return 0; + } + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 320 * K)) { + return 0; + } + vectork_encode(vectork_output, &pub->t1, 10); + + return 1; +} + +// FIPS 204, Algorithm 23 (`pkDecode`). +static int mldsa_parse_public_key(struct public_key *pub, CBS *in) { + if (!CBS_copy_bytes(in, pub->rho, sizeof(pub->rho))) { + return 0; + } + + CBS t1_bytes; + if (!CBS_get_bytes(in, &t1_bytes, 320 * K)) { + return 0; + } + vectork_decode_10(&pub->t1, CBS_data(&t1_bytes)); + + return 1; +} + +// FIPS 204, Algorithm 24 (`skEncode`). +static int mldsa_marshal_private_key(CBB *out, const struct private_key *priv) { + if (!CBB_add_bytes(out, priv->rho, sizeof(priv->rho)) || + !CBB_add_bytes(out, priv->k, sizeof(priv->k)) || + !CBB_add_bytes(out, priv->public_key_hash, + sizeof(priv->public_key_hash))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, 128 * L)) { + return 0; + } + vectorl_encode_signed(vectorl_output, &priv->s1, 4, ETA); + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 128 * K)) { + return 0; + } + vectork_encode_signed(vectork_output, &priv->s2, 4, ETA); + + if (!CBB_add_space(out, &vectork_output, 416 * K)) { + return 0; + } + vectork_encode_signed(vectork_output, &priv->t0, 13, 1 << 12); + + return 1; +} + +// FIPS 204, Algorithm 25 (`skDecode`). 
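For orientation, the encoders in this section determine the ML-DSA-65 wire sizes; the arithmetic below is just the constants of this file multiplied out, and the *_BYTES macros come from the public mldsa header:

//   pkEncode:  RHO_BYTES + 320*K                    = 32 + 1920      = 1952
//   skEncode:  32 + 32 + 64 + 128*L + 128*K + 416*K = 128 + 3904     = 4032
//   sigEncode: 2*LAMBDA_BYTES + 640*L + (OMEGA + K) = 48 + 3200 + 61 = 3309
static_assert(RHO_BYTES + 320 * K == MLDSA65_PUBLIC_KEY_BYTES,
              "pkEncode output must match the public key size");
static_assert(2 * LAMBDA_BYTES + 640 * L + OMEGA + K ==
                  MLDSA65_SIGNATURE_BYTES,
              "sigEncode output must match the signature size");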
+static int mldsa_parse_private_key(struct private_key *priv, CBS *in) { + CBS s1_bytes; + CBS s2_bytes; + CBS t0_bytes; + if (!CBS_copy_bytes(in, priv->rho, sizeof(priv->rho)) || + !CBS_copy_bytes(in, priv->k, sizeof(priv->k)) || + !CBS_copy_bytes(in, priv->public_key_hash, + sizeof(priv->public_key_hash)) || + !CBS_get_bytes(in, &s1_bytes, 128 * L) || + !vectorl_decode_signed(&priv->s1, CBS_data(&s1_bytes), 4, ETA) || + !CBS_get_bytes(in, &s2_bytes, 128 * K) || + !vectork_decode_signed(&priv->s2, CBS_data(&s2_bytes), 4, ETA) || + !CBS_get_bytes(in, &t0_bytes, 416 * K) || + // Note: Decoding 13 bits into (-2^12, 2^12] cannot fail. + !vectork_decode_signed(&priv->t0, CBS_data(&t0_bytes), 13, 1 << 12)) { + return 0; + } + + return 1; +} + +// FIPS 204, Algorithm 26 (`sigEncode`). +static int mldsa_marshal_signature(CBB *out, const struct signature *sign) { + if (!CBB_add_bytes(out, sign->c_tilde, sizeof(sign->c_tilde))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, 640 * L)) { + return 0; + } + vectorl_encode_signed(vectorl_output, &sign->z, 20, 1 << 19); + + uint8_t *hint_output; + if (!CBB_add_space(out, &hint_output, OMEGA + K)) { + return 0; + } + hint_bit_pack(hint_output, &sign->h); + + return 1; +} + +// FIPS 204, Algorithm 27 (`sigDecode`). +static int mldsa_parse_signature(struct signature *sign, CBS *in) { + CBS z_bytes; + CBS hint_bytes; + if (!CBS_copy_bytes(in, sign->c_tilde, sizeof(sign->c_tilde)) || + !CBS_get_bytes(in, &z_bytes, 640 * L) || + // Note: Decoding 20 bits into (-2^19, 2^19] cannot fail. + !vectorl_decode_signed(&sign->z, CBS_data(&z_bytes), 20, 1 << 19) || + !CBS_get_bytes(in, &hint_bytes, OMEGA + K) || + !hint_bit_unpack(&sign->h, CBS_data(&hint_bytes))) { + return 0; + }; + + return 1; +} + +static struct private_key *private_key_from_external( + const struct MLDSA65_private_key *external) { + static_assert( + sizeof(struct MLDSA65_private_key) == sizeof(struct private_key), + "Kyber private key size incorrect"); + static_assert( + alignof(struct MLDSA65_private_key) == alignof(struct private_key), + "Kyber private key align incorrect"); + return (struct private_key *)external; +} + +static struct public_key *public_key_from_external( + const struct MLDSA65_public_key *external) { + static_assert(sizeof(struct MLDSA65_public_key) == sizeof(struct public_key), + "mldsa public key size incorrect"); + static_assert( + alignof(struct MLDSA65_public_key) == alignof(struct public_key), + "mldsa public key align incorrect"); + return (struct public_key *)external; +} + +/* API */ + +// Calls |MLDSA_generate_key_external_entropy| with random bytes from +// |RAND_bytes|. Returns 1 on success and 0 on failure. +int MLDSA65_generate_key( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], + struct MLDSA65_private_key *out_private_key) { + RAND_bytes(out_seed, MLDSA_SEED_BYTES); + return MLDSA65_generate_key_external_entropy(out_encoded_public_key, + out_private_key, out_seed); +} + +int MLDSA65_private_key_from_seed(struct MLDSA65_private_key *out_private_key, + const uint8_t *seed, size_t seed_len) { + if (seed_len != MLDSA_SEED_BYTES) { + return 0; + } + uint8_t public_key[MLDSA65_PUBLIC_KEY_BYTES]; + return MLDSA65_generate_key_external_entropy(public_key, out_private_key, + seed); +} + +// FIPS 204, Algorithm 6 (`ML-DSA.KeyGen_internal`). Returns 1 on success and 0 +// on failure. 
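Key generation below is deterministic in the 32-byte seed; the randomized entry point simply feeds |RAND_bytes| output into the seeded path, so callers may persist just the seed and rebuild the private key later. A usage sketch of the public API declared in this file (illustrative only; a stack-constrained caller would heap-allocate the key structs, as this file itself does for its temporaries):

static int keygen_from_seed_example(void) {
  uint8_t encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES];
  uint8_t seed[MLDSA_SEED_BYTES];
  struct MLDSA65_private_key priv, priv_again;

  if (!MLDSA65_generate_key(encoded_public_key, seed, &priv) ||
      !MLDSA65_private_key_from_seed(&priv_again, seed, sizeof(seed))) {
    return 0;
  }
  // priv and priv_again now hold identical key material.
  return 1;
}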
+int MLDSA65_generate_key_external_entropy( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + struct MLDSA65_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + struct public_key pub; + matrix a_ntt; + vectorl s1_ntt; + vectork t; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + struct private_key *priv = private_key_from_external(out_private_key); + + uint8_t augmented_entropy[MLDSA_SEED_BYTES + 2]; + OPENSSL_memcpy(augmented_entropy, entropy, MLDSA_SEED_BYTES); + // The k and l parameters are appended to the seed. + augmented_entropy[MLDSA_SEED_BYTES] = K; + augmented_entropy[MLDSA_SEED_BYTES + 1] = L; + uint8_t expanded_seed[RHO_BYTES + SIGMA_BYTES + K_BYTES]; + BORINGSSL_keccak(expanded_seed, sizeof(expanded_seed), augmented_entropy, + sizeof(augmented_entropy), boringssl_shake256); + const uint8_t *const rho = expanded_seed; + const uint8_t *const sigma = expanded_seed + RHO_BYTES; + const uint8_t *const k = expanded_seed + RHO_BYTES + SIGMA_BYTES; + // rho is public. + CONSTTIME_DECLASSIFY(rho, RHO_BYTES); + OPENSSL_memcpy(values->pub.rho, rho, sizeof(values->pub.rho)); + OPENSSL_memcpy(priv->rho, rho, sizeof(priv->rho)); + OPENSSL_memcpy(priv->k, k, sizeof(priv->k)); + + matrix_expand(&values->a_ntt, rho); + vector_expand_short(&priv->s1, &priv->s2, sigma); + + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vectork_inverse_ntt(&values->t); + vectork_add(&values->t, &values->t, &priv->s2); + + vectork_power2_round(&values->pub.t1, &priv->t0, &values->t); + // t1 is public. + CONSTTIME_DECLASSIFY(&values->pub.t1, sizeof(values->pub.t1)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_public_key, MLDSA65_PUBLIC_KEY_BYTES); + if (!mldsa_marshal_public_key(&cbb, &values->pub)) { + goto err; + } + assert(CBB_len(&cbb) == MLDSA65_PUBLIC_KEY_BYTES); + + BORINGSSL_keccak(priv->public_key_hash, sizeof(priv->public_key_hash), + out_encoded_public_key, MLDSA65_PUBLIC_KEY_BYTES, + boringssl_shake256); + + ret = 1; +err: + OPENSSL_free(values); + return ret; +} + +int MLDSA65_public_from_private(struct MLDSA65_public_key *out_public_key, + const struct MLDSA65_private_key *private_key) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. 
+ struct values_st { + matrix a_ntt; + vectorl s1_ntt; + vectork t; + vectork t0; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + const struct private_key *priv = private_key_from_external(private_key); + struct public_key *pub = public_key_from_external(out_public_key); + + OPENSSL_memcpy(pub->rho, priv->rho, sizeof(pub->rho)); + OPENSSL_memcpy(pub->public_key_hash, priv->public_key_hash, + sizeof(pub->public_key_hash)); + + matrix_expand(&values->a_ntt, priv->rho); + + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vectork_inverse_ntt(&values->t); + vectork_add(&values->t, &values->t, &priv->s2); + + vectork_power2_round(&pub->t1, &values->t0, &values->t); + + ret = 1; +err: + OPENSSL_free(values); + return ret; +} + +// FIPS 204, Algorithm 7 (`ML-DSA.Sign_internal`). Returns 1 on success and 0 on +// failure. +int MLDSA65_sign_internal( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]) { + int ret = 0; + const struct private_key *priv = private_key_from_external(private_key); + + uint8_t mu[MU_BYTES]; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->public_key_hash, + sizeof(priv->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len); + BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, MU_BYTES); + + uint8_t rho_prime[RHO_PRIME_BYTES]; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->k, sizeof(priv->k)); + BORINGSSL_keccak_absorb(&keccak_ctx, randomizer, + MLDSA_SIGNATURE_RANDOMIZER_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_squeeze(&keccak_ctx, rho_prime, RHO_PRIME_BYTES); + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + struct signature sign; + vectorl s1_ntt; + vectork s2_ntt; + vectork t0_ntt; + matrix a_ntt; + vectorl y; + vectork w; + vectork w1; + vectorl cs1; + vectork cs2; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vectorl_ntt(&values->s1_ntt); + + OPENSSL_memcpy(&values->s2_ntt, &priv->s2, sizeof(values->s2_ntt)); + vectork_ntt(&values->s2_ntt); + + OPENSSL_memcpy(&values->t0_ntt, &priv->t0, sizeof(values->t0_ntt)); + vectork_ntt(&values->t0_ntt); + + matrix_expand(&values->a_ntt, priv->rho); + + // kappa must not exceed 2**16/L = 13107. But the probability of it exceeding + // even 1000 iterations is vanishingly small. 
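  // (For this parameter set the two rejection checks in the loop below fail
  // often enough that, per the Dilithium design documents, signing needs on
  // the order of four to five passes on average, so kappa stays far below the
  // 2**16/L ceiling mentioned above.)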
+ for (size_t kappa = 0;; kappa += L) { + vectorl_expand_mask(&values->y, rho_prime, kappa); + + vectorl *y_ntt = &values->cs1; + OPENSSL_memcpy(y_ntt, &values->y, sizeof(*y_ntt)); + vectorl_ntt(y_ntt); + + matrix_mult(&values->w, &values->a_ntt, y_ntt); + vectork_inverse_ntt(&values->w); + + vectork_high_bits(&values->w1, &values->w); + uint8_t w1_encoded[128 * K]; + w1_encode(w1_encoded, &values->w1); + + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K); + BORINGSSL_keccak_squeeze(&keccak_ctx, values->sign.c_tilde, + 2 * LAMBDA_BYTES); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, + sizeof(values->sign.c_tilde)); + scalar_ntt(&c_ntt); + + vectorl_mult_scalar(&values->cs1, &values->s1_ntt, &c_ntt); + vectorl_inverse_ntt(&values->cs1); + vectork_mult_scalar(&values->cs2, &values->s2_ntt, &c_ntt); + vectork_inverse_ntt(&values->cs2); + + vectorl_add(&values->sign.z, &values->y, &values->cs1); + + vectork *r0 = &values->w1; + vectork_sub(r0, &values->w, &values->cs2); + vectork_low_bits(r0, r0); + + // Leaking the fact that a signature was rejected is fine as the next + // attempt at a signature will be (indistinguishable from) independent of + // this one. Note, however, that we additionally leak which of the two + // branches rejected the signature. Section 5.5 of + // https://pq-crystals.org/dilithium/data/dilithium-specification-round3.pdf + // describes this leak as OK. Note we leak less than what is described by + // the paper; we do not reveal which coefficient violated the bound, and we + // hide which of the |z_max| or |r0_max| bound failed. See also + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/2bbab0fa_d241d35a/ + uint32_t z_max = vectorl_max(&values->sign.z); + uint32_t r0_max = vectork_max_signed(r0); + if (constant_time_declassify_w( + constant_time_ge_w(z_max, kGamma1 - BETA) | + constant_time_ge_w(r0_max, kGamma2 - BETA))) { + continue; + } + + vectork *ct0 = &values->w1; + vectork_mult_scalar(ct0, &values->t0_ntt, &c_ntt); + vectork_inverse_ntt(ct0); + vectork_make_hint(&values->sign.h, ct0, &values->cs2, &values->w); + + // See above. + uint32_t ct0_max = vectork_max(ct0); + size_t h_ones = vectork_count_ones(&values->sign.h); + if (constant_time_declassify_w(constant_time_ge_w(ct0_max, kGamma2) | + constant_time_lt_w(OMEGA, h_ones))) { + continue; + } + + // Although computed with the private key, the signature is public. + CONSTTIME_DECLASSIFY(values->sign.c_tilde, sizeof(values->sign.c_tilde)); + CONSTTIME_DECLASSIFY(&values->sign.z, sizeof(values->sign.z)); + CONSTTIME_DECLASSIFY(&values->sign.h, sizeof(values->sign.h)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_signature, MLDSA65_SIGNATURE_BYTES); + if (!mldsa_marshal_signature(&cbb, &values->sign)) { + goto err; + } + + BSSL_CHECK(CBB_len(&cbb) == MLDSA65_SIGNATURE_BYTES); + ret = 1; + break; + } + +err: + OPENSSL_free(values); + return ret; +} + +// mldsa signature in randomized mode, filling the random bytes with +// |RAND_bytes|. Returns 1 on success and 0 on failure. 
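A usage sketch pairing the randomized signing entry point below with verification; illustrative only, with error handling collapsed into the return value and the private key assumed to come from |MLDSA65_generate_key|:

static int sign_verify_example(const struct MLDSA65_private_key *priv) {
  static const uint8_t kMsg[] = {'h', 'i'};
  static const uint8_t kContext[] = {'e', 'x'};
  uint8_t sig[MLDSA65_SIGNATURE_BYTES];
  struct MLDSA65_public_key pub;

  return MLDSA65_public_from_private(&pub, priv) &&
         MLDSA65_sign(sig, priv, kMsg, sizeof(kMsg), kContext,
                      sizeof(kContext)) &&
         MLDSA65_verify(&pub, sig, sizeof(sig), kMsg, sizeof(kMsg), kContext,
                        sizeof(kContext));
}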
+int MLDSA65_sign(uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255) { + return 0; + } + + uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]; + RAND_bytes(randomizer, sizeof(randomizer)); + + const uint8_t context_prefix[2] = {0, context_len}; + return MLDSA65_sign_internal(out_encoded_signature, private_key, msg, msg_len, + context_prefix, sizeof(context_prefix), context, + context_len, randomizer); +} + +// FIPS 204, Algorithm 3 (`ML-DSA.Verify`). +int MLDSA65_verify(const struct MLDSA65_public_key *public_key, + const uint8_t *signature, size_t signature_len, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255 || signature_len != MLDSA65_SIGNATURE_BYTES) { + return 0; + } + + const uint8_t context_prefix[2] = {0, context_len}; + return MLDSA65_verify_internal(public_key, signature, msg, msg_len, + context_prefix, sizeof(context_prefix), + context, context_len); +} + +// FIPS 204, Algorithm 8 (`ML-DSA.Verify_internal`). +int MLDSA65_verify_internal( + const struct MLDSA65_public_key *public_key, + const uint8_t encoded_signature[MLDSA65_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix, + size_t context_prefix_len, const uint8_t *context, size_t context_len) { + int ret = 0; + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct values_st { + struct signature sign; + matrix a_ntt; + vectorl z_ntt; + vectork az_ntt; + vectork ct1_ntt; + }; + struct values_st *values = OPENSSL_malloc(sizeof(*values)); + if (values == NULL) { + goto err; + } + + const struct public_key *pub = public_key_from_external(public_key); + + CBS cbs; + CBS_init(&cbs, encoded_signature, MLDSA65_SIGNATURE_BYTES); + if (!mldsa_parse_signature(&values->sign, &cbs)) { + goto err; + } + + matrix_expand(&values->a_ntt, pub->rho); + + uint8_t mu[MU_BYTES]; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, pub->public_key_hash, + sizeof(pub->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len); + BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, MU_BYTES); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, + sizeof(values->sign.c_tilde)); + scalar_ntt(&c_ntt); + + OPENSSL_memcpy(&values->z_ntt, &values->sign.z, sizeof(values->z_ntt)); + vectorl_ntt(&values->z_ntt); + + matrix_mult(&values->az_ntt, &values->a_ntt, &values->z_ntt); + + vectork_scale_power2_round(&values->ct1_ntt, &pub->t1); + vectork_ntt(&values->ct1_ntt); + + vectork_mult_scalar(&values->ct1_ntt, &values->ct1_ntt, &c_ntt); + + vectork *const w1 = &values->az_ntt; + vectork_sub(w1, &values->az_ntt, &values->ct1_ntt); + vectork_inverse_ntt(w1); + + vectork_use_hint_vartime(w1, &values->sign.h, w1); + uint8_t w1_encoded[128 * K]; + w1_encode(w1_encoded, w1); + + uint8_t c_tilde[2 * LAMBDA_BYTES]; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, MU_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K); + BORINGSSL_keccak_squeeze(&keccak_ctx, c_tilde, 2 * LAMBDA_BYTES); + + uint32_t z_max = 
vectorl_max(&values->sign.z); + if (z_max < kGamma1 - BETA && + OPENSSL_memcmp(c_tilde, values->sign.c_tilde, 2 * LAMBDA_BYTES) == 0) { + ret = 1; + } + +err: + OPENSSL_free(values); + return ret; +} + +/* Serialization of keys. */ + +int MLDSA65_marshal_public_key(CBB *out, + const struct MLDSA65_public_key *public_key) { + return mldsa_marshal_public_key(out, public_key_from_external(public_key)); +} + +int MLDSA65_parse_public_key(struct MLDSA65_public_key *public_key, CBS *in) { + struct public_key *pub = public_key_from_external(public_key); + CBS orig_in = *in; + if (!mldsa_parse_public_key(pub, in) || CBS_len(in) != 0) { + return 0; + } + + // Compute pre-cached values. + BORINGSSL_keccak(pub->public_key_hash, sizeof(pub->public_key_hash), + CBS_data(&orig_in), CBS_len(&orig_in), boringssl_shake256); + return 1; +} + +int MLDSA65_marshal_private_key(CBB *out, + const struct MLDSA65_private_key *private_key) { + return mldsa_marshal_private_key(out, private_key_from_external(private_key)); +} + +int MLDSA65_parse_private_key(struct MLDSA65_private_key *private_key, + CBS *in) { + struct private_key *priv = private_key_from_external(private_key); + return mldsa_parse_private_key(priv, in) && CBS_len(in) == 0; +} diff --git a/Sources/CCryptoBoringSSL/crypto/mlkem/internal.h b/Sources/CCryptoBoringSSL/crypto/mlkem/internal.h new file mode 100644 index 00000000..c3018798 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/mlkem/internal.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2023, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H + +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + + +// MLKEM_ENCAP_ENTROPY is the number of bytes of uniformly random entropy +// necessary to encapsulate a secret. The entropy will be leaked to the +// decapsulating party. +#define MLKEM_ENCAP_ENTROPY 32 + +// MLKEM768_generate_key_external_seed is a deterministic function to create a +// pair of ML-KEM-768 keys, using the supplied seed. The seed needs to be +// uniformly random. This function is should only be used for tests, regular +// callers should use the non-deterministic |MLKEM768_generate_key| directly. +OPENSSL_EXPORT void MLKEM768_generate_key_external_seed( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + struct MLKEM768_private_key *out_private_key, + const uint8_t seed[MLKEM_SEED_BYTES]); + +// MLKEM768_encap_external_entropy behaves like |MLKEM768_encap|, but uses +// |MLKEM_ENCAP_ENTROPY| bytes of |entropy| for randomization. The decapsulating +// side will be able to recover |entropy| in full. This function should only be +// used for tests, regular callers should use the non-deterministic +// |MLKEM768_encap| directly. 
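For the regular, non-test path mentioned above, a round-trip sketch; the entry points used here (|MLKEM768_generate_key|, |MLKEM768_public_from_private|, |MLKEM768_encap|, |MLKEM768_decap|) are assumed to be the ones exposed by the public mlkem header, since this internal header only declares the deterministic test variants:

#include <string.h>

static int mlkem768_round_trip_sketch(void) {
  uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
  uint8_t seed[MLKEM_SEED_BYTES];
  uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
  uint8_t secret_a[MLKEM_SHARED_SECRET_BYTES];
  uint8_t secret_b[MLKEM_SHARED_SECRET_BYTES];
  struct MLKEM768_private_key priv;
  struct MLKEM768_public_key pub;

  MLKEM768_generate_key(encoded_public_key, seed, &priv);
  MLKEM768_public_from_private(&pub, &priv);
  MLKEM768_encap(ciphertext, secret_a, &pub);            // encapsulating side
  if (!MLKEM768_decap(secret_b, ciphertext, sizeof(ciphertext), &priv)) {
    return 0;                                            // malformed ciphertext
  }
  return memcmp(secret_a, secret_b, sizeof(secret_a)) == 0;  // test-style check
}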
+OPENSSL_EXPORT void MLKEM768_encap_external_entropy( + uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM768_public_key *public_key, + const uint8_t entropy[MLKEM_ENCAP_ENTROPY]); + +// MLKEM768_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-KEM-768 private keys. It returns one on success or +// zero on allocation error. (Note that one can also save just the seed value +// produced by |MLKEM768_generate_key|, which is significantly smaller.) +OPENSSL_EXPORT int MLKEM768_marshal_private_key( + CBB *out, const struct MLKEM768_private_key *private_key); + +// MLKEM1024_generate_key_external_seed is a deterministic function to create a +// pair of ML-KEM-1024 keys, using the supplied seed. The seed needs to be +// uniformly random. This function should only be used for tests; regular +// callers should use the non-deterministic |MLKEM1024_generate_key| directly. +OPENSSL_EXPORT void MLKEM1024_generate_key_external_seed( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + struct MLKEM1024_private_key *out_private_key, + const uint8_t seed[MLKEM_SEED_BYTES]); + +// MLKEM1024_encap_external_entropy behaves like |MLKEM1024_encap|, but uses +// |MLKEM_ENCAP_ENTROPY| bytes of |entropy| for randomization. The +// decapsulating side will be able to recover |entropy| in full. This function +// should only be used for tests; regular callers should use the +// non-deterministic |MLKEM1024_encap| directly. +OPENSSL_EXPORT void MLKEM1024_encap_external_entropy( + uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM1024_public_key *public_key, + const uint8_t entropy[MLKEM_ENCAP_ENTROPY]); + +// MLKEM1024_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-KEM-1024 private keys. It returns one on success or +// zero on allocation error. (Note that one can also save just the seed value +// produced by |MLKEM1024_generate_key|, which is significantly smaller.) +OPENSSL_EXPORT int MLKEM1024_marshal_private_key( + CBB *out, const struct MLKEM1024_private_key *private_key); + + +#if defined(__cplusplus) +} +#endif + +#endif // OPENSSL_HEADER_CRYPTO_MLKEM_INTERNAL_H diff --git a/Sources/CCryptoBoringSSL/crypto/obj/obj_dat.h b/Sources/CCryptoBoringSSL/crypto/obj/obj_dat.h index 71ef2d2b..f1b70639 100644 --- a/Sources/CCryptoBoringSSL/crypto/obj/obj_dat.h +++ b/Sources/CCryptoBoringSSL/crypto/obj/obj_dat.h @@ -57,7 +57,7 @@ /* This file is generated by crypto/obj/objects.go.
*/ -#define NUM_NID 965 +#define NUM_NID 966 static const uint8_t kObjectData[] = { /* NID_rsadsi */ @@ -8783,6 +8783,7 @@ static const ASN1_OBJECT kObjects[NUM_NID] = { {"HKDF", "hkdf", NID_hkdf, 0, NULL, 0}, {"X25519Kyber768Draft00", "X25519Kyber768Draft00", NID_X25519Kyber768Draft00, 0, NULL, 0}, + {"X25519MLKEM768", "X25519MLKEM768", NID_X25519MLKEM768, 0, NULL, 0}, }; static const uint16_t kNIDsInShortNameOrder[] = { @@ -8981,6 +8982,7 @@ static const uint16_t kNIDsInShortNameOrder[] = { 458 /* UID */, 948 /* X25519 */, 964 /* X25519Kyber768Draft00 */, + 965 /* X25519MLKEM768 */, 961 /* X448 */, 11 /* X500 */, 378 /* X500algorithms */, @@ -9852,6 +9854,7 @@ static const uint16_t kNIDsInLongNameOrder[] = { 375 /* Trust Root */, 948 /* X25519 */, 964 /* X25519Kyber768Draft00 */, + 965 /* X25519MLKEM768 */, 961 /* X448 */, 12 /* X509 */, 402 /* X509v3 AC Targeting */, diff --git a/Sources/CCryptoBoringSSL/crypto/pem/pem_lib.c b/Sources/CCryptoBoringSSL/crypto/pem/pem_lib.c index 34806378..b291782e 100644 --- a/Sources/CCryptoBoringSSL/crypto/pem/pem_lib.c +++ b/Sources/CCryptoBoringSSL/crypto/pem/pem_lib.c @@ -312,12 +312,11 @@ int PEM_ASN1_write_bio(i2d_of_void *i2d, const char *name, BIO *bp, void *x, const unsigned iv_len = EVP_CIPHER_iv_length(enc); if (pass == NULL) { - pass_len = 0; if (!callback) { callback = PEM_def_callback; } pass_len = (*callback)(buf, PEM_BUFSIZE, 1, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, PEM_R_READ_KEY); goto err; } @@ -393,7 +392,7 @@ int PEM_do_header(EVP_CIPHER_INFO *cipher, unsigned char *data, long *plen, callback = PEM_def_callback; } pass_len = callback(buf, PEM_BUFSIZE, 0, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); return 0; } @@ -779,11 +778,11 @@ int PEM_read_bio(BIO *bp, char **name, char **header, unsigned char **data, int PEM_def_callback(char *buf, int size, int rwflag, void *userdata) { if (!buf || !userdata || size < 0) { - return 0; + return -1; } size_t len = strlen((char *)userdata); if (len >= (size_t)size) { - return 0; + return -1; } OPENSSL_strlcpy(buf, userdata, (size_t)size); return (int)len; diff --git a/Sources/CCryptoBoringSSL/crypto/pem/pem_pk8.c b/Sources/CCryptoBoringSSL/crypto/pem/pem_pk8.c index aed4905f..415b45bd 100644 --- a/Sources/CCryptoBoringSSL/crypto/pem/pem_pk8.c +++ b/Sources/CCryptoBoringSSL/crypto/pem/pem_pk8.c @@ -113,12 +113,11 @@ static int do_pk8pkey(BIO *bp, const EVP_PKEY *x, int isder, int nid, } if (enc || (nid != -1)) { if (!pass) { - pass_len = 0; if (!cb) { cb = PEM_def_callback; } pass_len = cb(buf, PEM_BUFSIZE, 1, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, PEM_R_READ_KEY); PKCS8_PRIV_KEY_INFO_free(p8inf); return 0; @@ -166,7 +165,7 @@ EVP_PKEY *d2i_PKCS8PrivateKey_bio(BIO *bp, EVP_PKEY **x, pem_password_cb *cb, cb = PEM_def_callback; } pass_len = cb(psbuf, PEM_BUFSIZE, 0, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); X509_SIG_free(p8); return NULL; diff --git a/Sources/CCryptoBoringSSL/crypto/pem/pem_pkey.c b/Sources/CCryptoBoringSSL/crypto/pem/pem_pkey.c index 225a9e0d..d34d9359 100644 --- a/Sources/CCryptoBoringSSL/crypto/pem/pem_pkey.c +++ b/Sources/CCryptoBoringSSL/crypto/pem/pem_pkey.c @@ -110,7 +110,7 @@ EVP_PKEY *PEM_read_bio_PrivateKey(BIO *bp, EVP_PKEY **x, pem_password_cb *cb, cb = PEM_def_callback; } pass_len = cb(psbuf, PEM_BUFSIZE, 0, u); - if (pass_len <= 0) { + if (pass_len < 0) { OPENSSL_PUT_ERROR(PEM, 
PEM_R_BAD_PASSWORD_READ); X509_SIG_free(p8); goto err; diff --git a/Sources/CCryptoBoringSSL/crypto/pkcs8/internal.h b/Sources/CCryptoBoringSSL/crypto/pkcs8/internal.h index 853e830f..20b62e4f 100644 --- a/Sources/CCryptoBoringSSL/crypto/pkcs8/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/pkcs8/internal.h @@ -57,6 +57,7 @@ #define OPENSSL_HEADER_PKCS8_INTERNAL_H #include +#include #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/deterministic.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/deterministic.c index 96b9b336..0c2c5a9d 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/deterministic.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/deterministic.c @@ -14,7 +14,8 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_DETERMINISTIC) @@ -35,6 +36,8 @@ static CRYPTO_MUTEX g_num_calls_lock = CRYPTO_MUTEX_INIT; void RAND_reset_for_fuzzing(void) { g_num_calls = 0; } +void CRYPTO_init_sysrand(void) {} + void CRYPTO_sysrand(uint8_t *out, size_t requested) { static const uint8_t kZeroKey[32]; @@ -50,6 +53,11 @@ void CRYPTO_sysrand(uint8_t *out, size_t requested) { CRYPTO_chacha_20(out, out, requested, kZeroKey, nonce, 0); } +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/fork_detect.c similarity index 81% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.c rename to Sources/CCryptoBoringSSL/crypto/rand_extra/fork_detect.c index b3e1506a..99be497a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/fork_detect.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/fork_detect.c @@ -16,8 +16,7 @@ #define _GNU_SOURCE // needed for madvise() and MAP_ANONYMOUS on Linux. 
#endif -#include -#include "fork_detect.h" +#include "../bcm_support.h" #if defined(OPENSSL_FORK_DETECTION_MADVISE) #include @@ -35,19 +34,18 @@ static_assert(MADV_WIPEONFORK == 18, "MADV_WIPEONFORK is not 18"); #include #endif // OPENSSL_FORK_DETECTION_MADVISE -#include "../delocate.h" -#include "../../internal.h" +#include "../internal.h" #if defined(OPENSSL_FORK_DETECTION_MADVISE) -DEFINE_BSS_GET(int, g_force_madv_wipeonfork); -DEFINE_BSS_GET(int, g_force_madv_wipeonfork_enabled); -DEFINE_STATIC_ONCE(g_fork_detect_once); -DEFINE_STATIC_MUTEX(g_fork_detect_lock); -DEFINE_BSS_GET(CRYPTO_atomic_u32 *, g_fork_detect_addr); -DEFINE_BSS_GET(uint64_t, g_fork_generation); +static int g_force_madv_wipeonfork; +static int g_force_madv_wipeonfork_enabled; +static CRYPTO_once_t g_fork_detect_once = CRYPTO_ONCE_INIT; +static CRYPTO_MUTEX g_fork_detect_lock = CRYPTO_MUTEX_INIT; +static CRYPTO_atomic_u32 * g_fork_detect_addr; +static uint64_t g_fork_generation; static void init_fork_detect(void) { - if (*g_force_madv_wipeonfork_bss_get()) { + if (g_force_madv_wipeonfork) { return; } @@ -74,13 +72,13 @@ static void init_fork_detect(void) { } CRYPTO_atomic_store_u32(addr, 1); - *g_fork_detect_addr_bss_get() = addr; - *g_fork_generation_bss_get() = 1; + g_fork_detect_addr = addr; + g_fork_generation = 1; } uint64_t CRYPTO_get_fork_generation(void) { - CRYPTO_once(g_fork_detect_once_bss_get(), init_fork_detect); + CRYPTO_once(&g_fork_detect_once, init_fork_detect); // In a single-threaded process, there are obviously no races because there's // only a single mutator in the address space. @@ -93,12 +91,12 @@ uint64_t CRYPTO_get_fork_generation(void) { // child process is single-threaded, the child may become multi-threaded // before it observes this. Therefore, we must synchronize the logic below. - CRYPTO_atomic_u32 *const flag_ptr = *g_fork_detect_addr_bss_get(); + CRYPTO_atomic_u32 *const flag_ptr = g_fork_detect_addr; if (flag_ptr == NULL) { // Our kernel is too old to support |MADV_WIPEONFORK| or // |g_force_madv_wipeonfork| is set. - if (*g_force_madv_wipeonfork_bss_get() && - *g_force_madv_wipeonfork_enabled_bss_get()) { + if (g_force_madv_wipeonfork && + g_force_madv_wipeonfork_enabled) { // A constant generation number to simulate support, even if the kernel // doesn't support it. return 42; @@ -114,7 +112,7 @@ uint64_t CRYPTO_get_fork_generation(void) { // In the common case, try to observe the flag without taking a lock. This // avoids cacheline contention in the PRNG. - uint64_t *const generation_ptr = g_fork_generation_bss_get(); + uint64_t *const generation_ptr = &g_fork_generation; if (CRYPTO_atomic_load_u32(flag_ptr) != 0) { // If we observe a non-zero flag, it is safe to read |generation_ptr| // without a lock. The flag and generation number are fixed for this copy of @@ -125,7 +123,7 @@ uint64_t CRYPTO_get_fork_generation(void) { // The flag was zero. The generation number must be incremented, but other // threads may have concurrently observed the zero, so take a lock before // incrementing. 
- CRYPTO_MUTEX *const lock = g_fork_detect_lock_bss_get(); + CRYPTO_MUTEX *const lock = &g_fork_detect_lock; CRYPTO_MUTEX_lock_write(lock); uint64_t current_generation = *generation_ptr; if (CRYPTO_atomic_load_u32(flag_ptr) == 0) { @@ -147,35 +145,35 @@ uint64_t CRYPTO_get_fork_generation(void) { } void CRYPTO_fork_detect_force_madv_wipeonfork_for_testing(int on) { - *g_force_madv_wipeonfork_bss_get() = 1; - *g_force_madv_wipeonfork_enabled_bss_get() = on; + g_force_madv_wipeonfork = 1; + g_force_madv_wipeonfork_enabled = on; } #elif defined(OPENSSL_FORK_DETECTION_PTHREAD_ATFORK) -DEFINE_STATIC_ONCE(g_pthread_fork_detection_once); -DEFINE_BSS_GET(uint64_t, g_atfork_fork_generation); +static CRYPTO_once_t g_pthread_fork_detection_once = CRYPTO_ONCE_INIT; +static uint64_t g_atfork_fork_generation; static void we_are_forked(void) { // Immediately after a fork, the process must be single-threaded. - uint64_t value = *g_atfork_fork_generation_bss_get() + 1; + uint64_t value = g_atfork_fork_generation + 1; if (value == 0) { value = 1; } - *g_atfork_fork_generation_bss_get() = value; + g_atfork_fork_generation = value; } static void init_pthread_fork_detection(void) { if (pthread_atfork(NULL, NULL, we_are_forked) != 0) { abort(); } - *g_atfork_fork_generation_bss_get() = 1; + g_atfork_fork_generation = 1; } uint64_t CRYPTO_get_fork_generation(void) { - CRYPTO_once(g_pthread_fork_detection_once_bss_get(), init_pthread_fork_detection); + CRYPTO_once(&g_pthread_fork_detection_once, init_pthread_fork_detection); - return *g_atfork_fork_generation_bss_get(); + return g_atfork_fork_generation; } #elif defined(OPENSSL_DOES_NOT_FORK) diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/getentropy.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/getentropy.c index ccab73e1..02ddc4d8 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/getentropy.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/getentropy.c @@ -18,7 +18,8 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_GETENTROPY) @@ -30,6 +31,8 @@ #include #endif +void CRYPTO_init_sysrand(void) {} + // CRYPTO_sysrand puts |requested| random bytes into |out|. 
void CRYPTO_sysrand(uint8_t *out, size_t requested) { while (requested > 0) { @@ -45,6 +48,11 @@ void CRYPTO_sysrand(uint8_t *out, size_t requested) { } } +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/getrandom_fillin.h b/Sources/CCryptoBoringSSL/crypto/rand_extra/getrandom_fillin.h similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/getrandom_fillin.h rename to Sources/CCryptoBoringSSL/crypto/rand_extra/getrandom_fillin.h diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/ios.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/ios.c index ca3b7127..33c4bf92 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/ios.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/ios.c @@ -14,19 +14,27 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_IOS) #include #include +void CRYPTO_init_sysrand(void) {} + void CRYPTO_sysrand(uint8_t *out, size_t requested) { if (CCRandomGenerateBytes(out, requested) != kCCSuccess) { abort(); } } +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/passive.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/passive.c index fcaaa37b..e693ba1a 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/passive.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/passive.c @@ -14,11 +14,27 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../fipsmodule/bcm_interface.h" +#include "../bcm_support.h" #include "../internal.h" #if defined(BORINGSSL_FIPS) +// passive_get_seed_entropy writes |out_entropy_len| bytes of entropy, suitable +// for seeding a DRBG, to |out_entropy|. It sets |*out_want_additional_input| to one if the +// entropy came directly from the CPU and zero if it came from the OS.
It +// actively obtains entropy from the CPU/OS +static void passive_get_seed_entropy(uint8_t *out_entropy, + size_t out_entropy_len, + int *out_want_additional_input) { + *out_want_additional_input = 0; + if (bcm_success(BCM_rand_bytes_hwrng(out_entropy, out_entropy_len))) { + *out_want_additional_input = 1; + } else { + CRYPTO_sysrand_for_seed(out_entropy, out_entropy_len); + } +} + #define ENTROPY_READ_LEN \ (/* last_block size */ 16 + CTR_DRBG_ENTROPY_LEN * BORINGSSL_FIPS_OVERREAD) @@ -143,7 +159,7 @@ void RAND_need_entropy(size_t bytes_needed) { if (get_seed_from_daemon(buf, todo)) { want_additional_input = 1; } else { - CRYPTO_get_seed_entropy(buf, todo, &want_additional_input); + passive_get_seed_entropy(buf, todo, &want_additional_input); } if (boringssl_fips_break_test("CRNG")) { @@ -152,7 +168,7 @@ void RAND_need_entropy(size_t bytes_needed) { OPENSSL_memset(buf, 0, todo); } - RAND_load_entropy(buf, todo, want_additional_input); + BCM_rand_load_entropy(buf, todo, want_additional_input); } #endif // FIPS diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/rand_extra.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/rand_extra.c index 61e685e5..a0b9f10b 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/rand_extra.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/rand_extra.c @@ -12,11 +12,21 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include + #include -#include +#include "../bcm_support.h" +#include "../fipsmodule/bcm_interface.h" +int RAND_bytes(uint8_t *buf, size_t len) { + BCM_rand_bytes(buf, len); + return 1; +} + +int RAND_pseudo_bytes(uint8_t *buf, size_t len) { return RAND_bytes(buf, len); } + void RAND_seed(const void *buf, int num) { // OpenSSH calls |RAND_seed| before jailing on the assumption that any needed // file descriptors etc will be opened. 
@@ -28,7 +38,7 @@ int RAND_load_file(const char *path, long num) { if (num < 0) { // read the "whole file" return 1; } else if (num <= INT_MAX) { - return (int) num; + return (int)num; } else { return INT_MAX; } @@ -38,37 +48,30 @@ const char *RAND_file_name(char *buf, size_t num) { return NULL; } void RAND_add(const void *buf, int num, double entropy) {} -int RAND_egd(const char *path) { - return 255; -} +int RAND_egd(const char *path) { return 255; } -int RAND_poll(void) { - return 1; -} +int RAND_poll(void) { return 1; } -int RAND_status(void) { - return 1; -} +int RAND_status(void) { return 1; } static const struct rand_meth_st kSSLeayMethod = { - RAND_seed, - RAND_bytes, - RAND_cleanup, - RAND_add, - RAND_pseudo_bytes, - RAND_status, + RAND_seed, RAND_bytes, RAND_cleanup, + RAND_add, RAND_pseudo_bytes, RAND_status, }; -RAND_METHOD *RAND_SSLeay(void) { - return (RAND_METHOD*) &kSSLeayMethod; -} +RAND_METHOD *RAND_SSLeay(void) { return (RAND_METHOD *)&kSSLeayMethod; } -RAND_METHOD *RAND_OpenSSL(void) { - return RAND_SSLeay(); -} +RAND_METHOD *RAND_OpenSSL(void) { return RAND_SSLeay(); } const RAND_METHOD *RAND_get_rand_method(void) { return RAND_SSLeay(); } int RAND_set_rand_method(const RAND_METHOD *method) { return 1; } void RAND_cleanup(void) {} + +void RAND_get_system_entropy_for_custom_prng(uint8_t *buf, size_t len) { + if (len > 256) { + abort(); + } + CRYPTO_sysrand_for_seed(buf, len); +} diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/sysrand_internal.h b/Sources/CCryptoBoringSSL/crypto/rand_extra/sysrand_internal.h new file mode 100644 index 00000000..94e99ef1 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/sysrand_internal.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2024, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CRYPTO_SYSRAND_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_SYSRAND_INTERNAL_H + +#include + +#if defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) +#define OPENSSL_RAND_DETERMINISTIC +#elif defined(OPENSSL_TRUSTY) +#define OPENSSL_RAND_TRUSTY +#elif defined(OPENSSL_WINDOWS) +#define OPENSSL_RAND_WINDOWS +#elif defined(OPENSSL_LINUX) +#define OPENSSL_RAND_URANDOM +#elif defined(OPENSSL_APPLE) && !defined(OPENSSL_MACOS) +// Unlike macOS, iOS and similar hide away getentropy(). +#define OPENSSL_RAND_IOS +#else +// By default if you are integrating BoringSSL we expect you to +// provide getentropy from the header file. 
+#define OPENSSL_RAND_GETENTROPY +#endif + +#endif // OPENSSL_HEADER_CRYPTO__SYSRAND_INTERNAL_H diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/trusty.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/trusty.c index 88be7cbc..57e4148a 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/trusty.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/trusty.c @@ -14,7 +14,8 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_TRUSTY) #include @@ -25,12 +26,19 @@ #include +void CRYPTO_init_sysrand(void) {} + void CRYPTO_sysrand(uint8_t *out, size_t requested) { if (trusty_rng_hw_rand(out, requested) != NO_ERROR) { abort(); } } +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/urandom.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/urandom.c similarity index 91% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/urandom.c rename to Sources/CCryptoBoringSSL/crypto/rand_extra/urandom.c index 5af61e91..eabb7e8d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rand/urandom.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/urandom.c @@ -18,7 +18,8 @@ #include -#include "internal.h" +#include "../bcm_support.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_URANDOM) @@ -62,8 +63,7 @@ #include #include "getrandom_fillin.h" -#include "../delocate.h" -#include "../../internal.h" +#include "../internal.h" #if defined(USE_NR_getrandom) @@ -96,17 +96,17 @@ static ssize_t boringssl_getrandom(void *buf, size_t buf_len, unsigned flags) { static const int kHaveGetrandom = -3; // urandom_fd is a file descriptor to /dev/urandom. It's protected by |once|. -DEFINE_BSS_GET(int, urandom_fd) +static int urandom_fd; #if defined(USE_NR_getrandom) // getrandom_ready is one if |getrandom| had been initialized by the time // |init_once| was called and zero otherwise. -DEFINE_BSS_GET(int, getrandom_ready) +static int getrandom_ready; // extra_getrandom_flags_for_seed contains a value that is ORed into the flags // for getrandom() when reading entropy for a seed. -DEFINE_BSS_GET(int, extra_getrandom_flags_for_seed) +static int extra_getrandom_flags_for_seed; // On Android, check a system property to decide whether to set // |extra_getrandom_flags_for_seed| otherwise they will default to zero. If @@ -123,14 +123,14 @@ static void maybe_set_extra_getrandom_flags(void) { value[length] = 0; if (OPENSSL_strcasecmp(value, "true") == 0) { - *extra_getrandom_flags_for_seed_bss_get() = GRND_RANDOM; + extra_getrandom_flags_for_seed = GRND_RANDOM; } #endif } #endif // USE_NR_getrandom -DEFINE_STATIC_ONCE(rand_once) +static CRYPTO_once_t rand_once = CRYPTO_ONCE_INIT; // init_once initializes the state of this module to values previously // requested. This is the only function that modifies |urandom_fd|, which may be @@ -142,7 +142,7 @@ static void init_once(void) { ssize_t getrandom_ret = boringssl_getrandom(&dummy, sizeof(dummy), GRND_NONBLOCK); if (getrandom_ret == 1) { - *getrandom_ready_bss_get() = 1; + getrandom_ready = 1; have_getrandom = 1; } else if (getrandom_ret == -1 && errno == EAGAIN) { // We have getrandom, but the entropy pool has not been initialized yet. 
@@ -157,7 +157,7 @@ static void init_once(void) { } if (have_getrandom) { - *urandom_fd_bss_get() = kHaveGetrandom; + urandom_fd = kHaveGetrandom; maybe_set_extra_getrandom_flags(); return; } @@ -185,19 +185,19 @@ static void init_once(void) { abort(); } - *urandom_fd_bss_get() = fd; + urandom_fd = fd; } -DEFINE_STATIC_ONCE(wait_for_entropy_once) +static CRYPTO_once_t wait_for_entropy_once = CRYPTO_ONCE_INIT; static void wait_for_entropy(void) { - int fd = *urandom_fd_bss_get(); + int fd = urandom_fd; if (fd == kHaveGetrandom) { // |getrandom| and |getentropy| support blocking in |fill_with_entropy| // directly. For |getrandom|, we first probe with a non-blocking call to aid // debugging. #if defined(USE_NR_getrandom) - if (*getrandom_ready_bss_get()) { + if (getrandom_ready) { // The entropy pool was already initialized in |init_once|. return; } @@ -256,13 +256,13 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { #if defined (USE_NR_getrandom) if (seed) { - getrandom_flags |= *extra_getrandom_flags_for_seed_bss_get(); + getrandom_flags |= extra_getrandom_flags_for_seed; } #endif CRYPTO_init_sysrand(); if (block) { - CRYPTO_once(wait_for_entropy_once_bss_get(), wait_for_entropy); + CRYPTO_once(&wait_for_entropy_once, wait_for_entropy); } // Clear |errno| so it has defined value if |read| or |getrandom| @@ -271,7 +271,7 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { while (len > 0) { ssize_t r; - if (*urandom_fd_bss_get() == kHaveGetrandom) { + if (urandom_fd == kHaveGetrandom) { #if defined(USE_NR_getrandom) r = boringssl_getrandom(out, len, getrandom_flags); #else // USE_NR_getrandom @@ -280,7 +280,7 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { #endif } else { do { - r = read(*urandom_fd_bss_get(), out, len); + r = read(urandom_fd, out, len); } while (r == -1 && errno == EINTR); } @@ -295,7 +295,7 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { } void CRYPTO_init_sysrand(void) { - CRYPTO_once(rand_once_bss_get(), init_once); + CRYPTO_once(&rand_once, init_once); } // CRYPTO_sysrand puts |requested| random bytes into |out|. 
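The rand_extra backends touched by this change all implement one contract for bcm_support: CRYPTO_init_sysrand, CRYPTO_sysrand, CRYPTO_sysrand_if_available, and CRYPTO_sysrand_for_seed. As a rough sketch only (not part of the diff), a hypothetical backend whose entropy source is always ready would look like the following; platform_fill_random is an invented placeholder for the real OS call (getentropy, getrandom, CCRandomGenerateBytes, and so on in the files above and below).

#include <stddef.h>
#include <stdint.h>

// Hypothetical OS hook: a real backend must fill the buffer completely or
// abort, as the getentropy/ios/trusty implementations in this diff do.
extern void platform_fill_random(uint8_t *out, size_t len);

void CRYPTO_init_sysrand(void) {}  // nothing to set up for this backend

void CRYPTO_sysrand(uint8_t *out, size_t requested) {
  platform_fill_random(out, requested);
}

int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) {
  // Entropy is always available on this hypothetical platform.
  CRYPTO_sysrand(buf, len);
  return 1;
}

void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) {
  CRYPTO_sysrand(out, requested);
}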
diff --git a/Sources/CCryptoBoringSSL/crypto/rand_extra/windows.c b/Sources/CCryptoBoringSSL/crypto/rand_extra/windows.c index d7517af4..89351610 100644 --- a/Sources/CCryptoBoringSSL/crypto/rand_extra/windows.c +++ b/Sources/CCryptoBoringSSL/crypto/rand_extra/windows.c @@ -14,7 +14,9 @@ #include -#include "../fipsmodule/rand/internal.h" +#include "../bcm_support.h" +#include "../internal.h" +#include "sysrand_internal.h" #if defined(OPENSSL_RAND_WINDOWS) @@ -88,6 +90,11 @@ void CRYPTO_sysrand(uint8_t *out, size_t requested) { #endif // WINAPI_PARTITION_APP && !WINAPI_PARTITION_DESKTOP +int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { + CRYPTO_sysrand(buf, len); + return 1; +} + void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { CRYPTO_sysrand(out, requested); } diff --git a/Sources/CCryptoBoringSSL/crypto/rsa_extra/internal.h b/Sources/CCryptoBoringSSL/crypto/rsa_extra/internal.h index 6317cfc0..b47b9638 100644 --- a/Sources/CCryptoBoringSSL/crypto/rsa_extra/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/rsa_extra/internal.h @@ -58,6 +58,8 @@ #ifndef OPENSSL_HEADER_RSA_EXTRA_INTERNAL_H #define OPENSSL_HEADER_RSA_EXTRA_INTERNAL_H +#include + #if defined(__cplusplus) extern "C" { #endif diff --git a/Sources/CCryptoBoringSSL/crypto/sha/sha1.c b/Sources/CCryptoBoringSSL/crypto/sha/sha1.c new file mode 100644 index 00000000..f22b0683 --- /dev/null +++ b/Sources/CCryptoBoringSSL/crypto/sha/sha1.c @@ -0,0 +1,52 @@ +/* Copyright (c) 2024, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#include + +#include + +#include "../fipsmodule/bcm_interface.h" + +int SHA1_Init(SHA_CTX *sha) { + BCM_sha1_init(sha); + return 1; +} + +int SHA1_Update(SHA_CTX *sha, const void *data, size_t len) { + BCM_sha1_update(sha, data, len); + return 1; +} + +int SHA1_Final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *sha) { + BCM_sha1_final(out, sha); + return 1; +} + +uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) { + SHA_CTX ctx; + BCM_sha1_init(&ctx); + BCM_sha1_update(&ctx, data, len); + BCM_sha1_final(out, &ctx); + OPENSSL_cleanse(&ctx, sizeof(ctx)); + return out; +} + +void SHA1_Transform(SHA_CTX *sha, const uint8_t block[SHA_CBLOCK]) { + BCM_sha1_transform(sha, block); +} + +void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, + const uint8_t xkey[SHA_DIGEST_LENGTH]) { + BCM_fips_186_2_prf(out, out_len, xkey); +} diff --git a/Sources/CCryptoBoringSSL/crypto/spx/spx.c b/Sources/CCryptoBoringSSL/crypto/spx/spx.c index bc2b3d22..0f5a5652 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/spx.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx.c @@ -16,24 +16,25 @@ #include +#define OPENSSL_UNSTABLE_EXPERIMENTAL_SPX #include #include -#include "./address.h" -#include "./fors.h" -#include "./merkle.h" -#include "./params.h" +#include "./spx_address.h" +#include "./spx_fors.h" +#include "./spx_merkle.h" +#include "./spx_params.h" #include "./spx_util.h" -#include "./thash.h" +#include "./spx_thash.h" -void spx_generate_key(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], +void SPX_generate_key(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], uint8_t out_secret_key[SPX_SECRET_KEY_BYTES]) { uint8_t seed[3 * SPX_N]; RAND_bytes(seed, 3 * SPX_N); - spx_generate_key_from_seed(out_public_key, out_secret_key, seed); + SPX_generate_key_from_seed(out_public_key, out_secret_key, seed); } -void spx_generate_key_from_seed(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], +void SPX_generate_key_from_seed(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], uint8_t out_secret_key[SPX_SECRET_KEY_BYTES], const uint8_t seed[3 * SPX_N]) { // Initialize SK.seed || SK.prf || PK.seed from seed. 
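Since spx.c renames the SPHINCS+ entry points from spx_* to SPX_* (SPX_generate_key above, SPX_sign and SPX_verify in the hunks below), a keygen/sign/verify round trip now looks roughly like this sketch. It is illustrative only: the <openssl/experimental/spx.h> include path is the upstream name and an assumption here (the vendored tree rewrites header names); the SPX_*_BYTES constants and the call signatures match the definitions shown in this file.

#define OPENSSL_UNSTABLE_EXPERIMENTAL_SPX
#include <openssl/experimental/spx.h>

#include <stdint.h>

// Generates a key pair, signs a short message with fresh randomness
// (randomized = 1), and returns SPX_verify's result: one on success.
static int spx_sign_verify_roundtrip(void) {
  uint8_t pk[SPX_PUBLIC_KEY_BYTES];
  uint8_t sk[SPX_SECRET_KEY_BYTES];
  uint8_t sig[SPX_SIGNATURE_BYTES];
  static const uint8_t kMsg[] = {'s', 'p', 'x'};

  SPX_generate_key(pk, sk);
  SPX_sign(sig, sk, kMsg, sizeof(kMsg), /*randomized=*/1);
  return SPX_verify(sig, pk, kMsg, sizeof(kMsg));
}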
@@ -51,7 +52,7 @@ void spx_generate_key_from_seed(uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], memcpy(out_secret_key + 3 * SPX_N, out_public_key + SPX_N, SPX_N); } -void spx_sign(uint8_t out_signature[SPX_SIGNATURE_BYTES], +void SPX_sign(uint8_t out_signature[SPX_SIGNATURE_BYTES], const uint8_t secret_key[SPX_SECRET_KEY_BYTES], const uint8_t *msg, size_t msg_len, int randomized) { uint8_t addr[32] = {0}; @@ -102,7 +103,7 @@ void spx_sign(uint8_t out_signature[SPX_SIGNATURE_BYTES], idx_leaf, sk_seed, pk_seed); } -int spx_verify(const uint8_t signature[SPX_SIGNATURE_BYTES], +int SPX_verify(const uint8_t signature[SPX_SIGNATURE_BYTES], const uint8_t public_key[SPX_SECRET_KEY_BYTES], const uint8_t *msg, size_t msg_len) { uint8_t addr[32] = {0}; diff --git a/Sources/CCryptoBoringSSL/crypto/spx/address.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_address.c similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/spx/address.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_address.c index 8c489b1b..4e691572 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/address.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_address.c @@ -17,7 +17,7 @@ #include #include "../internal.h" -#include "./address.h" +#include "./spx_address.h" #include "./spx_util.h" diff --git a/Sources/CCryptoBoringSSL/crypto/spx/address.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_address.h similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/spx/address.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_address.h diff --git a/Sources/CCryptoBoringSSL/crypto/spx/fors.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_fors.c similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/fors.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_fors.c index 8dcbb5c7..e270b2bd 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/fors.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_fors.c @@ -16,11 +16,11 @@ #include -#include "./address.h" -#include "./fors.h" -#include "./params.h" +#include "./spx_address.h" +#include "./spx_fors.h" +#include "./spx_params.h" #include "./spx_util.h" -#include "./thash.h" +#include "./spx_thash.h" void spx_fors_sk_gen(uint8_t *fors_sk, uint32_t idx, const uint8_t sk_seed[SPX_N], const uint8_t pk_seed[SPX_N], diff --git a/Sources/CCryptoBoringSSL/crypto/spx/fors.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_fors.h similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/fors.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_fors.h index 77d677d9..cb6003e4 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/fors.h +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_fors.h @@ -17,7 +17,7 @@ #include -#include "./params.h" +#include "./spx_params.h" #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/spx/merkle.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.c similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/spx/merkle.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.c index 5f699923..02d3e214 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/merkle.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.c @@ -16,11 +16,11 @@ #include -#include "./address.h" -#include "./merkle.h" -#include "./params.h" -#include "./thash.h" -#include "./wots.h" +#include "./spx_address.h" +#include "./spx_merkle.h" +#include "./spx_params.h" +#include "./spx_thash.h" +#include "./spx_wots.h" void spx_treehash(uint8_t out_pk[SPX_N], const uint8_t sk_seed[SPX_N], uint32_t i /*target node index*/, diff --git 
a/Sources/CCryptoBoringSSL/crypto/spx/merkle.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.h similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/merkle.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.h index 7c422953..d659ab0a 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/merkle.h +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_merkle.h @@ -19,7 +19,7 @@ #include -#include "./params.h" +#include "./spx_params.h" #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/spx/params.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_params.h similarity index 100% rename from Sources/CCryptoBoringSSL/crypto/spx/params.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_params.h diff --git a/Sources/CCryptoBoringSSL/crypto/spx/thash.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_thash.c similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/thash.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_thash.c index e9a8ac8b..0fb86470 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/thash.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_thash.c @@ -20,9 +20,9 @@ #include -#include "./params.h" +#include "./spx_params.h" #include "./spx_util.h" -#include "./thash.h" +#include "./spx_thash.h" static void spx_thash(uint8_t *output, const uint8_t *input, size_t input_blocks, const uint8_t pk_seed[SPX_N], diff --git a/Sources/CCryptoBoringSSL/crypto/spx/thash.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_thash.h similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/spx/thash.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_thash.h index 8189d727..5ba46c3f 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/thash.h +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_thash.h @@ -17,7 +17,7 @@ #include -#include "./params.h" +#include "./spx_params.h" #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/spx/wots.c b/Sources/CCryptoBoringSSL/crypto/spx/spx_wots.c similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/spx/wots.c rename to Sources/CCryptoBoringSSL/crypto/spx/spx_wots.c index 21f35b72..7b840046 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/wots.c +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_wots.c @@ -18,11 +18,11 @@ #include #include -#include "./address.h" -#include "./params.h" +#include "./spx_address.h" +#include "./spx_params.h" #include "./spx_util.h" -#include "./thash.h" -#include "./wots.h" +#include "./spx_thash.h" +#include "./spx_wots.h" // Chaining function used in WOTS+. 
static void chain(uint8_t *output, const uint8_t *input, uint32_t start, diff --git a/Sources/CCryptoBoringSSL/crypto/spx/wots.h b/Sources/CCryptoBoringSSL/crypto/spx/spx_wots.h similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/spx/wots.h rename to Sources/CCryptoBoringSSL/crypto/spx/spx_wots.h index f40d5625..e9087f3f 100644 --- a/Sources/CCryptoBoringSSL/crypto/spx/wots.h +++ b/Sources/CCryptoBoringSSL/crypto/spx/spx_wots.h @@ -17,7 +17,7 @@ #include -#include "./params.h" +#include "./spx_params.h" #if defined(__cplusplus) extern "C" { diff --git a/Sources/CCryptoBoringSSL/crypto/x509/by_dir.c b/Sources/CCryptoBoringSSL/crypto/x509/by_dir.c index d7cd8048..6b71063e 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/by_dir.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/by_dir.c @@ -313,14 +313,20 @@ static int get_cert_by_subject(X509_LOOKUP *xl, int type, X509_NAME *name, k); if (type == X509_LU_X509) { if ((X509_load_cert_file(xl, b->data, ent->dir_type)) == 0) { + // Don't expose the lower level error. All of these boil + // down to "we could not find a CA". + ERR_clear_error(); break; } } else if (type == X509_LU_CRL) { if ((X509_load_crl_file(xl, b->data, ent->dir_type)) == 0) { + // Don't expose the lower level error. All of these boil + // down to "we could not find a CRL". + ERR_clear_error(); break; } } - // else case will caught higher up + // The lack of a CA or CRL will be caught higher up k++; } diff --git a/Sources/CCryptoBoringSSL/crypto/x509/internal.h b/Sources/CCryptoBoringSSL/crypto/x509/internal.h index 6a1beefc..ad42c3ee 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/internal.h +++ b/Sources/CCryptoBoringSSL/crypto/x509/internal.h @@ -341,8 +341,6 @@ struct x509_store_st { // Callbacks for various operations X509_STORE_CTX_verify_cb verify_cb; // error callback - X509_STORE_CTX_get_crl_fn get_crl; // retrieve CRL - X509_STORE_CTX_check_crl_fn check_crl; // Check CRL validity CRYPTO_refcount_t references; } /* X509_STORE */; @@ -374,8 +372,6 @@ struct x509_store_ctx_st { // Callbacks for various operations X509_STORE_CTX_verify_cb verify_cb; // error callback - X509_STORE_CTX_get_crl_fn get_crl; // retrieve CRL - X509_STORE_CTX_check_crl_fn check_crl; // Check CRL validity // The following is built up int last_untrusted; // index of last untrusted cert @@ -422,7 +418,7 @@ int x509_print_rsa_pss_params(BIO *bp, const X509_ALGOR *sigalg, int indent, // Signature algorithm functions. // x509_digest_sign_algorithm encodes the signing parameters of |ctx| as an -// AlgorithmIdentifer and saves the result in |algor|. It returns one on +// AlgorithmIdentifier and saves the result in |algor|. It returns one on // success, or zero on error.
int x509_digest_sign_algorithm(EVP_MD_CTX *ctx, X509_ALGOR *algor); diff --git a/Sources/CCryptoBoringSSL/crypto/x509/rsa_pss.c b/Sources/CCryptoBoringSSL/crypto/x509/rsa_pss.c index 9c868ce4..5c74fb5d 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/rsa_pss.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/rsa_pss.c @@ -125,7 +125,11 @@ static int rsa_md_to_algor(X509_ALGOR **palg, const EVP_MD *md) { if (*palg == NULL) { return 0; } - X509_ALGOR_set_md(*palg, md); + if (!X509_ALGOR_set_md(*palg, md)) { + X509_ALGOR_free(*palg); + *palg = NULL; + return 0; + } return 1; } diff --git a/Sources/CCryptoBoringSSL/crypto/x509/v3_utl.c b/Sources/CCryptoBoringSSL/crypto/x509/v3_utl.c index b65f67c5..7da46d17 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/v3_utl.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/v3_utl.c @@ -82,10 +82,10 @@ static void str_free(OPENSSL_STRING str); static int append_ia5(STACK_OF(OPENSSL_STRING) **sk, const ASN1_IA5STRING *email); -static int ipv4_from_asc(unsigned char v4[4], const char *in); -static int ipv6_from_asc(unsigned char v6[16], const char *in); +static int ipv4_from_asc(uint8_t v4[4], const char *in); +static int ipv6_from_asc(uint8_t v6[16], const char *in); static int ipv6_cb(const char *elem, size_t len, void *usr); -static int ipv6_hex(unsigned char *out, const char *in, size_t inlen); +static int ipv6_hex(uint8_t *out, const char *in, size_t inlen); // Add a CONF_VALUE name value pair to stack @@ -1154,7 +1154,7 @@ ASN1_OCTET_STRING *a2i_IPADDRESS_NC(const char *ipasc) { return NULL; } -int x509v3_a2i_ipadd(unsigned char ipout[16], const char *ipasc) { +int x509v3_a2i_ipadd(uint8_t ipout[16], const char *ipasc) { // If string contains a ':' assume IPv6 if (strchr(ipasc, ':')) { @@ -1170,25 +1170,58 @@ int x509v3_a2i_ipadd(unsigned char ipout[16], const char *ipasc) { } } -static int ipv4_from_asc(unsigned char v4[4], const char *in) { - int a0, a1, a2, a3; - if (sscanf(in, "%d.%d.%d.%d", &a0, &a1, &a2, &a3) != 4) { +// get_ipv4_component consumes one IPv4 component, terminated by either '.' or +// the end of the string, from |*str|. On success, it returns one, sets |*out| +// to the component, and advances |*str| to the first unconsumed character. On +// invalid input, it returns zero. +static int get_ipv4_component(uint8_t *out_byte, const char **str) { + // Store a slightly larger intermediary so the overflow check is easier. + uint32_t out = 0; + for (;;) { + if (!OPENSSL_isdigit(**str)) { + return 0; + } + out = (out * 10) + (**str - '0'); + if (out > 255) { + // Components must be 8-bit. + return 0; + } + (*str)++; + if ((**str) == '.' || (**str) == '\0') { + *out_byte = (uint8_t)out; + return 1; + } + if (out == 0) { + // Reject extra leading zeros. Parsers sometimes treat them as octal, so + // accepting them would misinterpret input. + return 0; + } + } +} + +// get_ipv4_dot consumes a '.' from |*str| and advances it. It returns one on +// success and zero if |*str| does not point to a '.'. 
+static int get_ipv4_dot(const char **str) { + if (**str != '.') { return 0; } - if ((a0 < 0) || (a0 > 255) || (a1 < 0) || (a1 > 255) || (a2 < 0) || - (a2 > 255) || (a3 < 0) || (a3 > 255)) { + (*str)++; + return 1; +} + +static int ipv4_from_asc(uint8_t v4[4], const char *in) { + if (!get_ipv4_component(&v4[0], &in) || !get_ipv4_dot(&in) || + !get_ipv4_component(&v4[1], &in) || !get_ipv4_dot(&in) || + !get_ipv4_component(&v4[2], &in) || !get_ipv4_dot(&in) || + !get_ipv4_component(&v4[3], &in) || *in != '\0') { return 0; } - v4[0] = a0; - v4[1] = a1; - v4[2] = a2; - v4[3] = a3; return 1; } typedef struct { // Temporary store for IPV6 output - unsigned char tmp[16]; + uint8_t tmp[16]; // Total number of bytes in tmp int total; // The position of a zero (corresponding to '::') @@ -1197,7 +1230,7 @@ typedef struct { int zero_cnt; } IPV6_STAT; -static int ipv6_from_asc(unsigned char v6[16], const char *in) { +static int ipv6_from_asc(uint8_t v6[16], const char *in) { IPV6_STAT v6stat; v6stat.total = 0; v6stat.zero_pos = -1; @@ -1305,7 +1338,7 @@ static int ipv6_cb(const char *elem, size_t len, void *usr) { // Convert a string of up to 4 hex digits into the corresponding IPv6 form. -static int ipv6_hex(unsigned char *out, const char *in, size_t inlen) { +static int ipv6_hex(uint8_t *out, const char *in, size_t inlen) { if (inlen > 4) { return 0; } diff --git a/Sources/CCryptoBoringSSL/crypto/x509/x509_lu.c b/Sources/CCryptoBoringSSL/crypto/x509/x509_lu.c index 75444603..29c22d83 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/x509_lu.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/x509_lu.c @@ -594,16 +594,6 @@ void X509_STORE_set_verify_cb(X509_STORE *ctx, ctx->verify_cb = verify_cb; } -void X509_STORE_set_get_crl(X509_STORE *ctx, - X509_STORE_CTX_get_crl_fn get_crl) { - ctx->get_crl = get_crl; -} - -void X509_STORE_set_check_crl(X509_STORE *ctx, - X509_STORE_CTX_check_crl_fn check_crl) { - ctx->check_crl = check_crl; -} - X509_STORE *X509_STORE_CTX_get0_store(const X509_STORE_CTX *ctx) { return ctx->ctx; } diff --git a/Sources/CCryptoBoringSSL/crypto/x509/x509_trs.c b/Sources/CCryptoBoringSSL/crypto/x509/x509_trs.c index 9d5118d3..af5a5b5e 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/x509_trs.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/x509_trs.c @@ -67,14 +67,14 @@ typedef struct x509_trust_st X509_TRUST; struct x509_trust_st { int trust; - int (*check_trust)(const X509_TRUST *, X509 *, int); + int (*check_trust)(const X509_TRUST *, X509 *); int nid; } /* X509_TRUST */; -static int trust_1oidany(const X509_TRUST *trust, X509 *x, int flags); -static int trust_compat(const X509_TRUST *trust, X509 *x, int flags); +static int trust_1oidany(const X509_TRUST *trust, X509 *x); +static int trust_compat(const X509_TRUST *trust, X509 *x); -static int obj_trust(int id, X509 *x, int flags); +static int obj_trust(int id, X509 *x); static const X509_TRUST trstandard[] = { {X509_TRUST_COMPAT, trust_compat, 0}, @@ -99,36 +99,36 @@ int X509_check_trust(X509 *x, int id, int flags) { } // We get this as a default value if (id == 0) { - int rv = obj_trust(NID_anyExtendedKeyUsage, x, 0); + int rv = obj_trust(NID_anyExtendedKeyUsage, x); if (rv != X509_TRUST_UNTRUSTED) { return rv; } - return trust_compat(NULL, x, 0); + return trust_compat(NULL, x); } const X509_TRUST *pt = X509_TRUST_get0(id); if (pt == NULL) { // Unknown trust IDs are silently reinterpreted as NIDs. This is unreachable // from the certificate verifier itself, but wpa_supplicant relies on it.
// Note this relies on commonly-used NIDs and trust IDs not colliding. - return obj_trust(id, x, flags); + return obj_trust(id, x); } - return pt->check_trust(pt, x, flags); + return pt->check_trust(pt, x); } int X509_is_valid_trust_id(int trust) { return X509_TRUST_get0(trust) != NULL; } -static int trust_1oidany(const X509_TRUST *trust, X509 *x, int flags) { +static int trust_1oidany(const X509_TRUST *trust, X509 *x) { if (x->aux && (x->aux->trust || x->aux->reject)) { - return obj_trust(trust->nid, x, flags); + return obj_trust(trust->nid, x); } // we don't have any trust settings: for compatibility we return trusted // if it is self signed - return trust_compat(trust, x, flags); + return trust_compat(trust, x); } -static int trust_compat(const X509_TRUST *trust, X509 *x, int flags) { +static int trust_compat(const X509_TRUST *trust, X509 *x) { if (!x509v3_cache_extensions(x)) { return X509_TRUST_UNTRUSTED; } @@ -139,25 +139,21 @@ static int trust_compat(const X509_TRUST *trust, X509 *x, int flags) { } } -static int obj_trust(int id, X509 *x, int flags) { +static int obj_trust(int id, X509 *x) { X509_CERT_AUX *ax = x->aux; if (!ax) { return X509_TRUST_UNTRUSTED; } - if (ax->reject) { - for (size_t i = 0; i < sk_ASN1_OBJECT_num(ax->reject); i++) { - const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(ax->reject, i); - if (OBJ_obj2nid(obj) == id) { - return X509_TRUST_REJECTED; - } + for (size_t i = 0; i < sk_ASN1_OBJECT_num(ax->reject); i++) { + const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(ax->reject, i); + if (OBJ_obj2nid(obj) == id) { + return X509_TRUST_REJECTED; } } - if (ax->trust) { - for (size_t i = 0; i < sk_ASN1_OBJECT_num(ax->trust); i++) { - const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(ax->trust, i); - if (OBJ_obj2nid(obj) == id) { - return X509_TRUST_TRUSTED; - } + for (size_t i = 0; i < sk_ASN1_OBJECT_num(ax->trust); i++) { + const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(ax->trust, i); + if (OBJ_obj2nid(obj) == id) { + return X509_TRUST_TRUSTED; } } return X509_TRUST_UNTRUSTED; diff --git a/Sources/CCryptoBoringSSL/crypto/x509/x509_vfy.c b/Sources/CCryptoBoringSSL/crypto/x509/x509_vfy.c index e9b290a9..20f26aa3 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/x509_vfy.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/x509_vfy.c @@ -117,6 +117,7 @@ static int get_crl(X509_STORE_CTX *ctx, X509_CRL **pcrl, X509 *x); static int crl_akid_check(X509_STORE_CTX *ctx, X509_CRL *crl, X509 **pissuer, int *pcrl_score); static int crl_crldp_check(X509 *x, X509_CRL *crl, int crl_score); +static int check_crl(X509_STORE_CTX *ctx, X509_CRL *crl); static int cert_crl(X509_STORE_CTX *ctx, X509_CRL *crl, X509 *x); static int internal_verify(X509_STORE_CTX *ctx); @@ -769,17 +770,18 @@ static int check_cert(X509_STORE_CTX *ctx) { // Try to retrieve the relevant CRL. Note that |get_crl| sets // |current_crl_issuer| and |current_crl_score|, which |check_crl| then reads. // - // TODO(davidben): Remove these callbacks. gRPC currently sets them, but - // implements them incorrectly. It is not actually possible to implement - // |get_crl| from outside the library. - if (!ctx->get_crl(ctx, &crl, x)) { + // TODO(davidben): The awkward internal calling convention is a historical + // artifact of when these functions were user-overridable callbacks, even + // though there was no way to set them correctly. These callbacks have since + // been removed, so we can pass input and output parameters more directly. 
+ if (!get_crl(ctx, &crl, x)) { ctx->error = X509_V_ERR_UNABLE_TO_GET_CRL; ok = call_verify_cb(0, ctx); goto err; } ctx->current_crl = crl; - if (!ctx->check_crl(ctx, crl) || // + if (!check_crl(ctx, crl) || // !cert_crl(ctx, crl, x)) { goto err; } @@ -1560,18 +1562,6 @@ int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, X509 *x509, ctx->verify_cb = null_callback; } - if (store->get_crl) { - ctx->get_crl = store->get_crl; - } else { - ctx->get_crl = get_crl; - } - - if (store->check_crl) { - ctx->check_crl = store->check_crl; - } else { - ctx->check_crl = check_crl; - } - return 1; err: diff --git a/Sources/CCryptoBoringSSL/crypto/x509/x_algor.c b/Sources/CCryptoBoringSSL/crypto/x509/x_algor.c index 8f4cc9aa..c9c61bcf 100644 --- a/Sources/CCryptoBoringSSL/crypto/x509/x_algor.c +++ b/Sources/CCryptoBoringSSL/crypto/x509/x_algor.c @@ -123,7 +123,7 @@ void X509_ALGOR_get0(const ASN1_OBJECT **out_obj, int *out_param_type, // Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD -void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md) { +int X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md) { int param_type; if (EVP_MD_flags(md) & EVP_MD_FLAG_DIGALGID_ABSENT) { @@ -132,7 +132,7 @@ void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md) { param_type = V_ASN1_NULL; } - X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL); + return X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL); } // X509_ALGOR_cmp returns 0 if |a| and |b| are equal and non-zero otherwise. diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-apple.S index 05e38f93..0d61efe4 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -868,7 +867,6 @@ L$one_lsb: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-linux.S index 1d65cc35..276d820a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-gcm-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-gcm-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -883,7 +882,6 @@ _CET_ENDBR .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-apple.S new file mode 100644 index 00000000..99b6bae4 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-apple.S @@ -0,0 +1,2495 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt +.align 4 +_aes_hw_encrypt: +L_aes_hw_encrypt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L000pic_for_function_hit +L000pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+1-L000pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 12(%esp),%edx + movups (%eax),%xmm2 + movl 240(%edx),%ecx + movl 8(%esp),%eax + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L001enc1_loop_1: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L001enc1_loop_1 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt +.align 4 +_aes_hw_decrypt: +L_aes_hw_decrypt_begin: + movl 4(%esp),%eax + movl 12(%esp),%edx + movups (%eax),%xmm2 + movl 240(%edx),%ecx + movl 8(%esp),%eax + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L002dec1_loop_2: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L002dec1_loop_2 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.private_extern __aesni_encrypt2 +.align 4 +__aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L003enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L003enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.private_extern __aesni_decrypt2 +.align 4 +__aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L004dec2_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L004dec2_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + ret +.private_extern __aesni_encrypt3 +.align 4 +__aesni_encrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L005enc3_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + 
movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%edx,%ecx,1),%xmm0 + jnz L005enc3_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + ret +.private_extern __aesni_decrypt3 +.align 4 +__aesni_decrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L006dec3_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 + movups -16(%edx,%ecx,1),%xmm0 + jnz L006dec3_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 + ret +.private_extern __aesni_encrypt4 +.align 4 +__aesni_encrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L007enc4_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%edx,%ecx,1),%xmm0 + jnz L007enc4_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + ret +.private_extern __aesni_decrypt4 +.align 4 +__aesni_decrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L008dec4_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%edx,%ecx,1),%xmm0 + jnz L008dec4_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 +.byte 102,15,56,223,232 + ret +.private_extern __aesni_encrypt6 +.align 4 +__aesni_encrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp L009_aesni_encrypt6_inner +.align 4,0x90 +L010enc6_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +L009_aesni_encrypt6_inner: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +L_aesni_encrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%edx,%ecx,1),%xmm0 + jnz L010enc6_loop 
+.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + ret +.private_extern __aesni_decrypt6 +.align 4 +__aesni_decrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,222,209 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,222,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,222,225 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp L011_aesni_decrypt6_inner +.align 4,0x90 +L012dec6_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +L011_aesni_decrypt6_inner: +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +L_aesni_decrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%edx,%ecx,1),%xmm0 + jnz L012dec6_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 +.byte 102,15,56,223,224 +.byte 102,15,56,223,232 +.byte 102,15,56,223,240 +.byte 102,15,56,223,248 + ret +.globl _aes_hw_ecb_encrypt +.private_extern _aes_hw_ecb_encrypt +.align 4 +_aes_hw_ecb_encrypt: +L_aes_hw_ecb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + andl $-16,%eax + jz L013ecb_ret + movl 240(%edx),%ecx + testl %ebx,%ebx + jz L014ecb_decrypt + movl %edx,%ebp + movl %ecx,%ebx + cmpl $96,%eax + jb L015ecb_enc_tail + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + subl $96,%eax + jmp L016ecb_enc_loop6_enter +.align 4,0x90 +L017ecb_enc_loop6: + movups %xmm2,(%edi) + movdqu (%esi),%xmm2 + movups %xmm3,16(%edi) + movdqu 16(%esi),%xmm3 + movups %xmm4,32(%edi) + movdqu 32(%esi),%xmm4 + movups %xmm5,48(%edi) + movdqu 48(%esi),%xmm5 + movups %xmm6,64(%edi) + movdqu 64(%esi),%xmm6 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi +L016ecb_enc_loop6_enter: + call __aesni_encrypt6 + movl %ebp,%edx + movl %ebx,%ecx + subl $96,%eax + jnc L017ecb_enc_loop6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + movups %xmm7,80(%edi) + leal 96(%edi),%edi + addl $96,%eax + jz L013ecb_ret +L015ecb_enc_tail: + movups (%esi),%xmm2 + cmpl $32,%eax + jb L018ecb_enc_one + movups 16(%esi),%xmm3 + je L019ecb_enc_two + movups 32(%esi),%xmm4 + cmpl $64,%eax + jb L020ecb_enc_three + movups 48(%esi),%xmm5 + je L021ecb_enc_four + movups 64(%esi),%xmm6 + xorps %xmm7,%xmm7 + call __aesni_encrypt6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L013ecb_ret +.align 4,0x90 +L018ecb_enc_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L022enc1_loop_3: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L022enc1_loop_3 +.byte 
102,15,56,221,209 + movups %xmm2,(%edi) + jmp L013ecb_ret +.align 4,0x90 +L019ecb_enc_two: + call __aesni_encrypt2 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L013ecb_ret +.align 4,0x90 +L020ecb_enc_three: + call __aesni_encrypt3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L013ecb_ret +.align 4,0x90 +L021ecb_enc_four: + call __aesni_encrypt4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + jmp L013ecb_ret +.align 4,0x90 +L014ecb_decrypt: + movl %edx,%ebp + movl %ecx,%ebx + cmpl $96,%eax + jb L023ecb_dec_tail + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + subl $96,%eax + jmp L024ecb_dec_loop6_enter +.align 4,0x90 +L025ecb_dec_loop6: + movups %xmm2,(%edi) + movdqu (%esi),%xmm2 + movups %xmm3,16(%edi) + movdqu 16(%esi),%xmm3 + movups %xmm4,32(%edi) + movdqu 32(%esi),%xmm4 + movups %xmm5,48(%edi) + movdqu 48(%esi),%xmm5 + movups %xmm6,64(%edi) + movdqu 64(%esi),%xmm6 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi +L024ecb_dec_loop6_enter: + call __aesni_decrypt6 + movl %ebp,%edx + movl %ebx,%ecx + subl $96,%eax + jnc L025ecb_dec_loop6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + movups %xmm7,80(%edi) + leal 96(%edi),%edi + addl $96,%eax + jz L013ecb_ret +L023ecb_dec_tail: + movups (%esi),%xmm2 + cmpl $32,%eax + jb L026ecb_dec_one + movups 16(%esi),%xmm3 + je L027ecb_dec_two + movups 32(%esi),%xmm4 + cmpl $64,%eax + jb L028ecb_dec_three + movups 48(%esi),%xmm5 + je L029ecb_dec_four + movups 64(%esi),%xmm6 + xorps %xmm7,%xmm7 + call __aesni_decrypt6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L013ecb_ret +.align 4,0x90 +L026ecb_dec_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L030dec1_loop_4: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L030dec1_loop_4 +.byte 102,15,56,223,209 + movups %xmm2,(%edi) + jmp L013ecb_ret +.align 4,0x90 +L027ecb_dec_two: + call __aesni_decrypt2 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L013ecb_ret +.align 4,0x90 +L028ecb_dec_three: + call __aesni_decrypt3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L013ecb_ret +.align 4,0x90 +L029ecb_dec_four: + call __aesni_decrypt4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +L013ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ccm64_encrypt_blocks +.private_extern _aes_hw_ccm64_encrypt_blocks +.align 4 +_aes_hw_ccm64_encrypt_blocks: +L_aes_hw_ccm64_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl %esp,%ebp + subl $60,%esp + andl $-16,%esp + movl %ebp,48(%esp) + movdqu (%ebx),%xmm7 + movdqu (%ecx),%xmm3 + movl 240(%edx),%ecx + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $1,%ebx + xorl %ebp,%ebp + movl %ebx,16(%esp) + movl %ebp,20(%esp) + movl %ebp,24(%esp) + movl 
%ebp,28(%esp) + shll $4,%ecx + movl $16,%ebx + leal (%edx),%ebp + movdqa (%esp),%xmm5 + movdqa %xmm7,%xmm2 + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx +.byte 102,15,56,0,253 +L031ccm64_enc_outer: + movups (%ebp),%xmm0 + movl %ebx,%ecx + movups (%esi),%xmm6 + xorps %xmm0,%xmm2 + movups 16(%ebp),%xmm1 + xorps %xmm6,%xmm0 + xorps %xmm0,%xmm3 + movups 32(%ebp),%xmm0 +L032ccm64_enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L032ccm64_enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + paddq 16(%esp),%xmm7 + decl %eax +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + leal 16(%esi),%esi + xorps %xmm2,%xmm6 + movdqa %xmm7,%xmm2 + movups %xmm6,(%edi) +.byte 102,15,56,0,213 + leal 16(%edi),%edi + jnz L031ccm64_enc_outer + movl 48(%esp),%esp + movl 40(%esp),%edi + movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ccm64_decrypt_blocks +.private_extern _aes_hw_ccm64_decrypt_blocks +.align 4 +_aes_hw_ccm64_decrypt_blocks: +L_aes_hw_ccm64_decrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl %esp,%ebp + subl $60,%esp + andl $-16,%esp + movl %ebp,48(%esp) + movdqu (%ebx),%xmm7 + movdqu (%ecx),%xmm3 + movl 240(%edx),%ecx + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $1,%ebx + xorl %ebp,%ebp + movl %ebx,16(%esp) + movl %ebp,20(%esp) + movl %ebp,24(%esp) + movl %ebp,28(%esp) + movdqa (%esp),%xmm5 + movdqa %xmm7,%xmm2 + movl %edx,%ebp + movl %ecx,%ebx +.byte 102,15,56,0,253 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L033enc1_loop_5: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L033enc1_loop_5 +.byte 102,15,56,221,209 + shll $4,%ebx + movl $16,%ecx + movups (%esi),%xmm6 + paddq 16(%esp),%xmm7 + leal 16(%esi),%esi + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp L034ccm64_dec_outer +.align 4,0x90 +L034ccm64_dec_outer: + xorps %xmm2,%xmm6 + movdqa %xmm7,%xmm2 + movups %xmm6,(%edi) + leal 16(%edi),%edi +.byte 102,15,56,0,213 + subl $1,%eax + jz L035ccm64_dec_break + movups (%ebp),%xmm0 + movl %ebx,%ecx + movups 16(%ebp),%xmm1 + xorps %xmm0,%xmm6 + xorps %xmm0,%xmm2 + xorps %xmm6,%xmm3 + movups 32(%ebp),%xmm0 +L036ccm64_dec2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L036ccm64_dec2_loop + movups (%esi),%xmm6 + paddq 16(%esp),%xmm7 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + leal 16(%esi),%esi + jmp L034ccm64_dec_outer +.align 4,0x90 +L035ccm64_dec_break: + movl 240(%ebp),%ecx + movl %ebp,%edx + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm6 + leal 32(%edx),%edx + xorps %xmm6,%xmm3 +L037enc1_loop_6: +.byte 102,15,56,220,217 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L037enc1_loop_6 +.byte 102,15,56,221,217 + movl 48(%esp),%esp + movl 40(%esp),%edi + movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor 
%xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks +.align 4 +_aes_hw_ctr32_encrypt_blocks: +L_aes_hw_ctr32_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L038pic_for_function_hit +L038pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+0-L038pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $88,%esp + andl $-16,%esp + movl %ebp,80(%esp) + cmpl $1,%eax + je L039ctr32_one_shortcut + movdqu (%ebx),%xmm7 + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $6,%ecx + xorl %ebp,%ebp + movl %ecx,16(%esp) + movl %ecx,20(%esp) + movl %ecx,24(%esp) + movl %ebp,28(%esp) +.byte 102,15,58,22,251,3 +.byte 102,15,58,34,253,3 + movl 240(%edx),%ecx + bswap %ebx + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqa (%esp),%xmm2 +.byte 102,15,58,34,195,0 + leal 3(%ebx),%ebp +.byte 102,15,58,34,205,0 + incl %ebx +.byte 102,15,58,34,195,1 + incl %ebp +.byte 102,15,58,34,205,1 + incl %ebx +.byte 102,15,58,34,195,2 + incl %ebp +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 + cmpl $6,%eax + jb L040ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx + movdqa %xmm7,32(%esp) + movl %edx,%ebp + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl $6,%eax + jmp L041ctr32_loop6 +.align 4,0x90 +L041ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 + pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 + pxor %xmm0,%xmm3 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + call L_aesni_encrypt6_enter + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups %xmm2,(%edi) + movdqa 16(%esp),%xmm0 + xorps %xmm1,%xmm4 + movdqa 64(%esp),%xmm1 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + paddd %xmm0,%xmm1 + paddd 48(%esp),%xmm0 + movdqa (%esp),%xmm2 + movups 48(%esi),%xmm3 + movups 64(%esi),%xmm4 + xorps %xmm3,%xmm5 + movups 80(%esi),%xmm3 + leal 96(%esi),%esi + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 + xorps %xmm4,%xmm6 + movups %xmm5,48(%edi) + xorps %xmm3,%xmm7 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + movups %xmm6,64(%edi) + pshufd $192,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + pshufd $128,%xmm0,%xmm3 + subl $6,%eax + jnc L041ctr32_loop6 + addl $6,%eax + jz L042ctr32_ret + movdqu (%ebp),%xmm7 + movl %ebp,%edx + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +L040ctr32_tail: + por %xmm7,%xmm2 + cmpl $2,%eax + jb L043ctr32_one + pshufd $64,%xmm0,%xmm4 + por %xmm7,%xmm3 + je L044ctr32_two + pshufd $192,%xmm1,%xmm5 + por %xmm7,%xmm4 + cmpl $4,%eax + jb L045ctr32_three + pshufd $128,%xmm1,%xmm6 + por %xmm7,%xmm5 + je L046ctr32_four + por %xmm7,%xmm6 + call __aesni_encrypt6 + movups 
(%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm4 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm5 + movups %xmm2,(%edi) + xorps %xmm1,%xmm6 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L039ctr32_one_shortcut: + movups (%ebx),%xmm2 + movl 240(%edx),%ecx +L043ctr32_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L047enc1_loop_7: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L047enc1_loop_7 +.byte 102,15,56,221,209 + movups (%esi),%xmm6 + xorps %xmm2,%xmm6 + movups %xmm6,(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L044ctr32_two: + call __aesni_encrypt2 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L045ctr32_three: + call __aesni_encrypt3 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + movups 32(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm7,%xmm4 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L046ctr32_four: + call __aesni_encrypt4 + movups (%esi),%xmm6 + movups 16(%esi),%xmm7 + movups 32(%esi),%xmm1 + xorps %xmm6,%xmm2 + movups 48(%esi),%xmm0 + xorps %xmm7,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +L042ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movl 80(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_xts_encrypt +.private_extern _aes_hw_xts_encrypt +.align 4 +_aes_hw_xts_encrypt: +L_aes_hw_xts_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 36(%esp),%edx + movl 40(%esp),%esi + movl 240(%edx),%ecx + movups (%esi),%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L048enc1_loop_8: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L048enc1_loop_8 +.byte 102,15,56,221,209 + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl %esp,%ebp + subl $120,%esp + movl 240(%edx),%ecx + andl $-16,%esp + movl $135,96(%esp) + movl $0,100(%esp) + movl $1,104(%esp) + movl $0,108(%esp) + movl %eax,112(%esp) + movl %ebp,116(%esp) + movdqa %xmm2,%xmm1 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + pcmpgtd %xmm1,%xmm0 + andl $-16,%eax + movl %edx,%ebp + movl %ecx,%ebx + subl $96,%eax + jc L049xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L050xts_enc_loop6 +.align 4,0x90 +L050xts_enc_loop6: + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,16(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,32(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor 
%xmm2,%xmm1 + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,64(%esp) + paddq %xmm1,%xmm1 + movups (%ebp),%xmm0 + pand %xmm3,%xmm7 + movups (%esi),%xmm2 + pxor %xmm1,%xmm7 + movl %ebx,%ecx + movdqu 16(%esi),%xmm3 + xorps %xmm0,%xmm2 + movdqu 32(%esi),%xmm4 + pxor %xmm0,%xmm3 + movdqu 48(%esi),%xmm5 + pxor %xmm0,%xmm4 + movdqu 64(%esi),%xmm6 + pxor %xmm0,%xmm5 + movdqu 80(%esi),%xmm1 + pxor %xmm0,%xmm6 + leal 96(%esi),%esi + pxor (%esp),%xmm2 + movdqa %xmm7,80(%esp) + pxor %xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 +.byte 102,15,56,220,209 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 +.byte 102,15,56,220,217 + pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + call L_aesni_encrypt6_enter + movdqa 80(%esp),%xmm1 + pxor %xmm0,%xmm0 + xorps (%esp),%xmm2 + pcmpgtd %xmm1,%xmm0 + xorps 16(%esp),%xmm3 + movups %xmm2,(%edi) + xorps 32(%esp),%xmm4 + movups %xmm3,16(%edi) + xorps 48(%esp),%xmm5 + movups %xmm4,32(%edi) + xorps 64(%esp),%xmm6 + movups %xmm5,48(%edi) + xorps %xmm1,%xmm7 + movups %xmm6,64(%edi) + pshufd $19,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqa 96(%esp),%xmm3 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + subl $96,%eax + jnc L050xts_enc_loop6 + movl 240(%ebp),%ecx + movl %ebp,%edx + movl %ecx,%ebx +L049xts_enc_short: + addl $96,%eax + jz L051xts_enc_done6x + movdqa %xmm1,%xmm5 + cmpl $32,%eax + jb L052xts_enc_one + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + je L053xts_enc_two + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + cmpl $64,%eax + jb L054xts_enc_three + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm7 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + movdqa %xmm5,(%esp) + movdqa %xmm6,16(%esp) + je L055xts_enc_four + movdqa %xmm7,32(%esp) + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm7 + pxor %xmm1,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + pxor (%esp),%xmm2 + movdqu 48(%esi),%xmm5 + pxor 16(%esp),%xmm3 + movdqu 64(%esi),%xmm6 + pxor 32(%esp),%xmm4 + leal 80(%esi),%esi + pxor 48(%esp),%xmm5 + movdqa %xmm7,64(%esp) + pxor %xmm7,%xmm6 + call __aesni_encrypt6 + movaps 64(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps 32(%esp),%xmm4 + movups %xmm2,(%edi) + xorps 48(%esp),%xmm5 + movups %xmm3,16(%edi) + xorps %xmm1,%xmm6 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + leal 80(%edi),%edi + jmp L056xts_enc_done +.align 4,0x90 +L052xts_enc_one: + movups (%esi),%xmm2 + leal 16(%esi),%esi + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L057enc1_loop_9: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L057enc1_loop_9 +.byte 102,15,56,221,209 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) + leal 16(%edi),%edi + movdqa %xmm5,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L053xts_enc_two: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + leal 32(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + call __aesni_encrypt2 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 32(%edi),%edi + movdqa %xmm6,%xmm1 + jmp 
L056xts_enc_done +.align 4,0x90 +L054xts_enc_three: + movaps %xmm1,%xmm7 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + leal 48(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + call __aesni_encrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + leal 48(%edi),%edi + movdqa %xmm7,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L055xts_enc_four: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + xorps (%esp),%xmm2 + movups 48(%esi),%xmm5 + leal 64(%esi),%esi + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + xorps %xmm6,%xmm5 + call __aesni_encrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + xorps %xmm6,%xmm5 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + leal 64(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L051xts_enc_done6x: + movl 112(%esp),%eax + andl $15,%eax + jz L058xts_enc_ret + movdqa %xmm1,%xmm5 + movl %eax,112(%esp) + jmp L059xts_enc_steal +.align 4,0x90 +L056xts_enc_done: + movl 112(%esp),%eax + pxor %xmm0,%xmm0 + andl $15,%eax + jz L058xts_enc_ret + pcmpgtd %xmm1,%xmm0 + movl %eax,112(%esp) + pshufd $19,%xmm0,%xmm5 + paddq %xmm1,%xmm1 + pand 96(%esp),%xmm5 + pxor %xmm1,%xmm5 +L059xts_enc_steal: + movzbl (%esi),%ecx + movzbl -16(%edi),%edx + leal 1(%esi),%esi + movb %cl,-16(%edi) + movb %dl,(%edi) + leal 1(%edi),%edi + subl $1,%eax + jnz L059xts_enc_steal + subl 112(%esp),%edi + movl %ebp,%edx + movl %ebx,%ecx + movups -16(%edi),%xmm2 + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L060enc1_loop_10: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L060enc1_loop_10 +.byte 102,15,56,221,209 + xorps %xmm5,%xmm2 + movups %xmm2,-16(%edi) +L058xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) + movl 116(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_xts_decrypt +.private_extern _aes_hw_xts_decrypt +.align 4 +_aes_hw_xts_decrypt: +L_aes_hw_xts_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 36(%esp),%edx + movl 40(%esp),%esi + movl 240(%edx),%ecx + movups (%esi),%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L061enc1_loop_11: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L061enc1_loop_11 +.byte 102,15,56,221,209 + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl %esp,%ebp + subl $120,%esp + andl $-16,%esp + xorl %ebx,%ebx + testl $15,%eax + setnz %bl + shll $4,%ebx + subl %ebx,%eax + movl $135,96(%esp) + movl $0,100(%esp) + movl $1,104(%esp) + movl $0,108(%esp) + movl %eax,112(%esp) + movl %ebp,116(%esp) + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ecx,%ebx + movdqa %xmm2,%xmm1 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + pcmpgtd %xmm1,%xmm0 + andl $-16,%eax + subl $96,%eax + jc L062xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L063xts_dec_loop6 +.align 4,0x90 +L063xts_dec_loop6: + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,(%esp) 
+ paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,16(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,32(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,64(%esp) + paddq %xmm1,%xmm1 + movups (%ebp),%xmm0 + pand %xmm3,%xmm7 + movups (%esi),%xmm2 + pxor %xmm1,%xmm7 + movl %ebx,%ecx + movdqu 16(%esi),%xmm3 + xorps %xmm0,%xmm2 + movdqu 32(%esi),%xmm4 + pxor %xmm0,%xmm3 + movdqu 48(%esi),%xmm5 + pxor %xmm0,%xmm4 + movdqu 64(%esi),%xmm6 + pxor %xmm0,%xmm5 + movdqu 80(%esi),%xmm1 + pxor %xmm0,%xmm6 + leal 96(%esi),%esi + pxor (%esp),%xmm2 + movdqa %xmm7,80(%esp) + pxor %xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 +.byte 102,15,56,222,209 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + call L_aesni_decrypt6_enter + movdqa 80(%esp),%xmm1 + pxor %xmm0,%xmm0 + xorps (%esp),%xmm2 + pcmpgtd %xmm1,%xmm0 + xorps 16(%esp),%xmm3 + movups %xmm2,(%edi) + xorps 32(%esp),%xmm4 + movups %xmm3,16(%edi) + xorps 48(%esp),%xmm5 + movups %xmm4,32(%edi) + xorps 64(%esp),%xmm6 + movups %xmm5,48(%edi) + xorps %xmm1,%xmm7 + movups %xmm6,64(%edi) + pshufd $19,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqa 96(%esp),%xmm3 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + subl $96,%eax + jnc L063xts_dec_loop6 + movl 240(%ebp),%ecx + movl %ebp,%edx + movl %ecx,%ebx +L062xts_dec_short: + addl $96,%eax + jz L064xts_dec_done6x + movdqa %xmm1,%xmm5 + cmpl $32,%eax + jb L065xts_dec_one + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + je L066xts_dec_two + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + cmpl $64,%eax + jb L067xts_dec_three + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm7 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + movdqa %xmm5,(%esp) + movdqa %xmm6,16(%esp) + je L068xts_dec_four + movdqa %xmm7,32(%esp) + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm7 + pxor %xmm1,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + pxor (%esp),%xmm2 + movdqu 48(%esi),%xmm5 + pxor 16(%esp),%xmm3 + movdqu 64(%esi),%xmm6 + pxor 32(%esp),%xmm4 + leal 80(%esi),%esi + pxor 48(%esp),%xmm5 + movdqa %xmm7,64(%esp) + pxor %xmm7,%xmm6 + call __aesni_decrypt6 + movaps 64(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps 32(%esp),%xmm4 + movups %xmm2,(%edi) + xorps 48(%esp),%xmm5 + movups %xmm3,16(%edi) + xorps %xmm1,%xmm6 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + leal 80(%edi),%edi + jmp L069xts_dec_done +.align 4,0x90 +L065xts_dec_one: + movups (%esi),%xmm2 + leal 16(%esi),%esi + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L070dec1_loop_12: +.byte 102,15,56,222,209 + decl %ecx + 
movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L070dec1_loop_12 +.byte 102,15,56,223,209 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) + leal 16(%edi),%edi + movdqa %xmm5,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L066xts_dec_two: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + leal 32(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + call __aesni_decrypt2 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 32(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L067xts_dec_three: + movaps %xmm1,%xmm7 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + leal 48(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + call __aesni_decrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + leal 48(%edi),%edi + movdqa %xmm7,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L068xts_dec_four: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + xorps (%esp),%xmm2 + movups 48(%esi),%xmm5 + leal 64(%esi),%esi + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + xorps %xmm6,%xmm5 + call __aesni_decrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + xorps %xmm6,%xmm5 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + leal 64(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L064xts_dec_done6x: + movl 112(%esp),%eax + andl $15,%eax + jz L071xts_dec_ret + movl %eax,112(%esp) + jmp L072xts_dec_only_one_more +.align 4,0x90 +L069xts_dec_done: + movl 112(%esp),%eax + pxor %xmm0,%xmm0 + andl $15,%eax + jz L071xts_dec_ret + pcmpgtd %xmm1,%xmm0 + movl %eax,112(%esp) + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 +L072xts_dec_only_one_more: + pshufd $19,%xmm0,%xmm5 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm5 + pxor %xmm1,%xmm5 + movl %ebp,%edx + movl %ebx,%ecx + movups (%esi),%xmm2 + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L073dec1_loop_13: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L073dec1_loop_13 +.byte 102,15,56,223,209 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) +L074xts_dec_steal: + movzbl 16(%esi),%ecx + movzbl (%edi),%edx + leal 1(%esi),%esi + movb %cl,(%edi) + movb %dl,16(%edi) + leal 1(%edi),%edi + subl $1,%eax + jnz L074xts_dec_steal + subl 112(%esp),%edi + movl %ebp,%edx + movl %ebx,%ecx + movups (%edi),%xmm2 + xorps %xmm6,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L075dec1_loop_14: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L075dec1_loop_14 +.byte 102,15,56,223,209 + xorps %xmm6,%xmm2 + movups %xmm2,(%edi) +L071xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) + movl 116(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt +.align 4 +_aes_hw_cbc_encrypt: +L_aes_hw_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 
20(%esp),%esi + movl %esp,%ebx + movl 24(%esp),%edi + subl $24,%ebx + movl 28(%esp),%eax + andl $-16,%ebx + movl 32(%esp),%edx + movl 36(%esp),%ebp + testl %eax,%eax + jz L076cbc_abort + cmpl $0,40(%esp) + xchgl %esp,%ebx + movups (%ebp),%xmm7 + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ebx,16(%esp) + movl %ecx,%ebx + je L077cbc_decrypt + movaps %xmm7,%xmm2 + cmpl $16,%eax + jb L078cbc_enc_tail + subl $16,%eax + jmp L079cbc_enc_loop +.align 4,0x90 +L079cbc_enc_loop: + movups (%esi),%xmm7 + leal 16(%esi),%esi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm7 + leal 32(%edx),%edx + xorps %xmm7,%xmm2 +L080enc1_loop_15: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L080enc1_loop_15 +.byte 102,15,56,221,209 + movl %ebx,%ecx + movl %ebp,%edx + movups %xmm2,(%edi) + leal 16(%edi),%edi + subl $16,%eax + jnc L079cbc_enc_loop + addl $16,%eax + jnz L078cbc_enc_tail + movaps %xmm2,%xmm7 + pxor %xmm2,%xmm2 + jmp L081cbc_ret +L078cbc_enc_tail: + movl %eax,%ecx +.long 2767451785 + movl $16,%ecx + subl %eax,%ecx + xorl %eax,%eax +.long 2868115081 + leal -16(%edi),%edi + movl %ebx,%ecx + movl %edi,%esi + movl %ebp,%edx + jmp L079cbc_enc_loop +.align 4,0x90 +L077cbc_decrypt: + cmpl $80,%eax + jbe L082cbc_dec_tail + movaps %xmm7,(%esp) + subl $80,%eax + jmp L083cbc_dec_loop6_enter +.align 4,0x90 +L084cbc_dec_loop6: + movaps %xmm0,(%esp) + movups %xmm7,(%edi) + leal 16(%edi),%edi +L083cbc_dec_loop6_enter: + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + call __aesni_decrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps (%esp),%xmm2 + xorps %xmm1,%xmm3 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm4 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm5 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm6 + movups 80(%esi),%xmm0 + xorps %xmm1,%xmm7 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 96(%esi),%esi + movups %xmm4,32(%edi) + movl %ebx,%ecx + movups %xmm5,48(%edi) + movl %ebp,%edx + movups %xmm6,64(%edi) + leal 80(%edi),%edi + subl $96,%eax + ja L084cbc_dec_loop6 + movaps %xmm7,%xmm2 + movaps %xmm0,%xmm7 + addl $80,%eax + jle L085cbc_dec_clear_tail_collected + movups %xmm2,(%edi) + leal 16(%edi),%edi +L082cbc_dec_tail: + movups (%esi),%xmm2 + movaps %xmm2,%xmm6 + cmpl $16,%eax + jbe L086cbc_dec_one + movups 16(%esi),%xmm3 + movaps %xmm3,%xmm5 + cmpl $32,%eax + jbe L087cbc_dec_two + movups 32(%esi),%xmm4 + cmpl $48,%eax + jbe L088cbc_dec_three + movups 48(%esi),%xmm5 + cmpl $64,%eax + jbe L089cbc_dec_four + movups 64(%esi),%xmm6 + movaps %xmm7,(%esp) + movups (%esi),%xmm2 + xorps %xmm7,%xmm7 + call __aesni_decrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps (%esp),%xmm2 + xorps %xmm1,%xmm3 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm4 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm5 + movups 64(%esi),%xmm7 + xorps %xmm0,%xmm6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 + leal 64(%edi),%edi + movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 + subl $80,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L086cbc_dec_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L091dec1_loop_16: +.byte 102,15,56,222,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L091dec1_loop_16 +.byte 102,15,56,223,209 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm7 + subl $16,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 
+L087cbc_dec_two: + call __aesni_decrypt2 + xorps %xmm7,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + leal 16(%edi),%edi + movaps %xmm5,%xmm7 + subl $32,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L088cbc_dec_three: + call __aesni_decrypt3 + xorps %xmm7,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm5,%xmm4 + movups %xmm2,(%edi) + movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + leal 32(%edi),%edi + movups 32(%esi),%xmm7 + subl $48,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L089cbc_dec_four: + call __aesni_decrypt4 + movups 16(%esi),%xmm1 + movups 32(%esi),%xmm0 + xorps %xmm7,%xmm2 + movups 48(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 + leal 48(%edi),%edi + movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 + subl $64,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L085cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +L090cbc_dec_tail_collected: + andl $15,%eax + jnz L092cbc_dec_tail_partial + movups %xmm2,(%edi) + pxor %xmm0,%xmm0 + jmp L081cbc_ret +.align 4,0x90 +L092cbc_dec_tail_partial: + movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 + movl $16,%ecx + movl %esp,%esi + subl %eax,%ecx +.long 2767451785 + movdqa %xmm2,(%esp) +L081cbc_ret: + movl 16(%esp),%esp + movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 + movups %xmm7,(%ebp) + pxor %xmm7,%xmm7 +L076cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_set_encrypt_key_base +.private_extern _aes_hw_set_encrypt_key_base +.align 4 +_aes_hw_set_encrypt_key_base: +L_aes_hw_set_encrypt_key_base_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L093pic_for_function_hit +L093pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+3-L093pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call L094pic +L094pic: + popl %ebx + leal Lkey_const-L094pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je L09514rounds + cmpl $192,%ecx + je L09612rounds + cmpl $128,%ecx + jne L097bad_keybits +.align 4,0x90 +L09810rounds: + movl $9,%ecx + movups %xmm0,-16(%edx) +.byte 102,15,58,223,200,1 + call L099key_128_cold +.byte 102,15,58,223,200,2 + call L100key_128 +.byte 102,15,58,223,200,4 + call L100key_128 +.byte 102,15,58,223,200,8 + call L100key_128 +.byte 102,15,58,223,200,16 + call L100key_128 +.byte 102,15,58,223,200,32 + call L100key_128 +.byte 102,15,58,223,200,64 + call L100key_128 +.byte 102,15,58,223,200,128 + call L100key_128 +.byte 102,15,58,223,200,27 + call L100key_128 +.byte 102,15,58,223,200,54 + call L100key_128 + movups %xmm0,(%edx) + movl %ecx,80(%edx) + jmp L101good_key +.align 4,0x90 +L100key_128: + movups %xmm0,(%edx) + leal 16(%edx),%edx +L099key_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 4,0x90 +L09612rounds: + movq 16(%eax),%xmm2 + movl $11,%ecx + movups %xmm0,-16(%edx) +.byte 102,15,58,223,202,1 + call L102key_192a_cold +.byte 102,15,58,223,202,2 + call L103key_192b +.byte 102,15,58,223,202,4 + call L104key_192a +.byte 102,15,58,223,202,8 + call L103key_192b +.byte 102,15,58,223,202,16 + call L104key_192a +.byte 
102,15,58,223,202,32 + call L103key_192b +.byte 102,15,58,223,202,64 + call L104key_192a +.byte 102,15,58,223,202,128 + call L103key_192b + movups %xmm0,(%edx) + movl %ecx,48(%edx) + jmp L101good_key +.align 4,0x90 +L104key_192a: + movups %xmm0,(%edx) + leal 16(%edx),%edx +.align 4,0x90 +L102key_192a_cold: + movaps %xmm2,%xmm5 +L105key_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret +.align 4,0x90 +L103key_192b: + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%edx) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%edx) + leal 32(%edx),%edx + jmp L105key_192b_warm +.align 4,0x90 +L09514rounds: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movl $13,%ecx + movups %xmm0,-32(%edx) + movups %xmm2,-16(%edx) +.byte 102,15,58,223,202,1 + call L106key_256a_cold +.byte 102,15,58,223,200,1 + call L107key_256b +.byte 102,15,58,223,202,2 + call L108key_256a +.byte 102,15,58,223,200,2 + call L107key_256b +.byte 102,15,58,223,202,4 + call L108key_256a +.byte 102,15,58,223,200,4 + call L107key_256b +.byte 102,15,58,223,202,8 + call L108key_256a +.byte 102,15,58,223,200,8 + call L107key_256b +.byte 102,15,58,223,202,16 + call L108key_256a +.byte 102,15,58,223,200,16 + call L107key_256b +.byte 102,15,58,223,202,32 + call L108key_256a +.byte 102,15,58,223,200,32 + call L107key_256b +.byte 102,15,58,223,202,64 + call L108key_256a + movups %xmm0,(%edx) + movl %ecx,16(%edx) + xorl %eax,%eax + jmp L101good_key +.align 4,0x90 +L108key_256a: + movups %xmm2,(%edx) + leal 16(%edx),%edx +L106key_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 4,0x90 +L107key_256b: + movups %xmm0,(%edx) + leal 16(%edx),%edx + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +L101good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 2,0x90 +L097bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.globl _aes_hw_set_encrypt_key_alt +.private_extern _aes_hw_set_encrypt_key_alt +.align 4 +_aes_hw_set_encrypt_key_alt: +L_aes_hw_set_encrypt_key_alt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L109pic_for_function_hit +L109pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+3-L109pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call L110pic +L110pic: + popl %ebx + leal Lkey_const-L110pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je L11114rounds_alt + cmpl $192,%ecx + je L11212rounds_alt + cmpl $128,%ecx + jne L113bad_keybits +.align 4,0x90 +L11410rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +L115loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz L115loop_key128 + 
movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp L116good_key +.align 4,0x90 +L11212rounds_alt: + movq 16(%eax),%xmm2 + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +L117loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz L117loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp L116good_key +.align 4,0x90 +L11114rounds_alt: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +L118loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz L119done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp L118loop_key256 +L119done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +L116good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 2,0x90 +L113bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.globl _aes_hw_encrypt_key_to_decrypt_key +.private_extern _aes_hw_encrypt_key_to_decrypt_key +.align 4 +_aes_hw_encrypt_key_to_decrypt_key: +L_aes_hw_encrypt_key_to_decrypt_key_begin: + movl 4(%esp),%edx + movl 240(%edx),%ecx + shll $4,%ecx + leal 16(%edx,%ecx,1),%eax + movups (%edx),%xmm0 + movups (%eax),%xmm1 + movups %xmm0,(%eax) + movups %xmm1,(%edx) + leal 16(%edx),%edx + leal -16(%eax),%eax +L120dec_key_inverse: + movups (%edx),%xmm0 + movups (%eax),%xmm1 +.byte 102,15,56,219,192 +.byte 102,15,56,219,201 + leal 16(%edx),%edx + leal -16(%eax),%eax + movups %xmm0,16(%eax) + movups %xmm1,-16(%edx) + cmpl %edx,%eax + ja L120dec_key_inverse + movups (%edx),%xmm0 +.byte 102,15,56,219,192 + movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + ret +.align 6,0x90 +Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +.byte 115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if 
defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-linux.S similarity index 93% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-linux.S index 8d6cf17d..0c76eb2d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -18,10 +17,10 @@ aes_hw_encrypt: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx - call .L000pic -.L000pic: + call .L000pic_for_function_hit +.L000pic_for_function_hit: popl %ebx - leal BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx + leal BORINGSSL_function_hit+1-.L000pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx @@ -849,10 +848,10 @@ aes_hw_ctr32_encrypt_blocks: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx - call .L038pic -.L038pic: + call .L038pic_for_function_hit +.L038pic_for_function_hit: popl %ebx - leal BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx + leal BORINGSSL_function_hit+0-.L038pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx @@ -2101,26 +2100,35 @@ aes_hw_cbc_encrypt: popl %ebp ret .size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin -.hidden _aesni_set_encrypt_key -.type _aesni_set_encrypt_key,@function +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function .align 16 -_aesni_set_encrypt_key: - pushl %ebp +aes_hw_set_encrypt_key_base: +.L_aes_hw_set_encrypt_key_base_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L093pic_for_function_hit +.L093pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L093pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx pushl %ebx - testl %eax,%eax - jz .L093bad_pointer - testl %edx,%edx - jz .L093bad_pointer call .L094pic .L094pic: popl %ebx leal .Lkey_const-.L094pic(%ebx),%ebx - leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 - movl 4(%ebp),%ebp leal 16(%edx),%edx - andl $268437504,%ebp cmpl $256,%ecx je .L09514rounds cmpl $192,%ecx @@ -2129,38 +2137,36 @@ _aesni_set_encrypt_key: jne .L097bad_keybits .align 16 .L09810rounds: - cmpl $268435456,%ebp - je .L09910rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L100key_128_cold + call .L099key_128_cold .byte 102,15,58,223,200,2 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,4 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,8 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,16 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,32 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,64 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,128 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,27 - call .L101key_128 + call .L100key_128 .byte 102,15,58,223,200,54 - call .L101key_128 + call .L100key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - jmp .L102good_key + jmp .L101good_key .align 16 -.L101key_128: 
+.L100key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L100key_128_cold: +.L099key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2169,91 +2175,37 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L09910rounds_alt: - movdqa (%ebx),%xmm5 - movl $8,%ecx - movdqa 32(%ebx),%xmm4 - movdqa %xmm0,%xmm2 - movdqu %xmm0,-16(%edx) -.L103loop_key128: -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - leal 16(%edx),%edx - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,-16(%edx) - movdqa %xmm0,%xmm2 - decl %ecx - jnz .L103loop_key128 - movdqa 48(%ebx),%xmm4 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,(%edx) - movdqa %xmm0,%xmm2 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,16(%edx) - movl $9,%ecx - movl %ecx,96(%edx) - jmp .L102good_key -.align 16 .L09612rounds: movq 16(%eax),%xmm2 - cmpl $268435456,%ebp - je .L10412rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L105key_192a_cold + call .L102key_192a_cold .byte 102,15,58,223,202,2 - call .L106key_192b + call .L103key_192b .byte 102,15,58,223,202,4 - call .L107key_192a + call .L104key_192a .byte 102,15,58,223,202,8 - call .L106key_192b + call .L103key_192b .byte 102,15,58,223,202,16 - call .L107key_192a + call .L104key_192a .byte 102,15,58,223,202,32 - call .L106key_192b + call .L103key_192b .byte 102,15,58,223,202,64 - call .L107key_192a + call .L104key_192a .byte 102,15,58,223,202,128 - call .L106key_192b + call .L103key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - jmp .L102good_key + jmp .L101good_key .align 16 -.L107key_192a: +.L104key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L105key_192a_cold: +.L102key_192a_cold: movaps %xmm2,%xmm5 -.L108key_192b_warm: +.L105key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2267,90 +2219,56 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L106key_192b: +.L103key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L108key_192b_warm -.align 16 -.L10412rounds_alt: - movdqa 16(%ebx),%xmm5 - movdqa 32(%ebx),%xmm4 - movl $8,%ecx - movdqu %xmm0,-16(%edx) -.L109loop_key192: - movq %xmm2,(%edx) - movdqa %xmm2,%xmm1 -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - pslld $1,%xmm4 - leal 24(%edx),%edx - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm2 - movdqu %xmm0,-16(%edx) - decl %ecx - jnz .L109loop_key192 - movl $11,%ecx - movl %ecx,32(%edx) - jmp .L102good_key + jmp .L105key_192b_warm .align 16 .L09514rounds: movups 16(%eax),%xmm2 leal 16(%edx),%edx - cmpl $268435456,%ebp - je .L11014rounds_alt movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L111key_256a_cold + call .L106key_256a_cold .byte 102,15,58,223,200,1 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,2 - 
call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,2 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,4 - call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,4 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,8 - call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,8 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,16 - call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,16 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,32 - call .L113key_256a + call .L108key_256a .byte 102,15,58,223,200,32 - call .L112key_256b + call .L107key_256b .byte 102,15,58,223,202,64 - call .L113key_256a + call .L108key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - jmp .L102good_key + jmp .L101good_key .align 16 -.L113key_256a: +.L108key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L111key_256a_cold: +.L106key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2359,7 +2277,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L112key_256b: +.L107key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2369,15 +2287,154 @@ _aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.L101good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 4 +.L097bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.size aes_hw_set_encrypt_key_base,.-.L_aes_hw_set_encrypt_key_base_begin +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.L_aes_hw_set_encrypt_key_alt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L109pic_for_function_hit +.L109pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L109pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call .L110pic +.L110pic: + popl %ebx + leal .Lkey_const-.L110pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je .L11114rounds_alt + cmpl $192,%ecx + je .L11212rounds_alt + cmpl $128,%ecx + jne .L113bad_keybits .align 16 -.L11014rounds_alt: +.L11410rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L115loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L115loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L116good_key +.align 16 +.L11212rounds_alt: + movq 16(%eax),%xmm2 + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl 
$8,%ecx + movdqu %xmm0,-16(%edx) +.L117loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz .L117loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp .L116good_key +.align 16 +.L11114rounds_alt: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx movdqa (%ebx),%xmm5 movdqa 32(%ebx),%xmm4 movl $7,%ecx movdqu %xmm0,-32(%edx) movdqa %xmm2,%xmm1 movdqu %xmm2,-16(%edx) -.L114loop_key256: +.L118loop_key256: .byte 102,15,56,0,213 .byte 102,15,56,221,212 movdqa %xmm0,%xmm3 @@ -2391,7 +2448,7 @@ _aesni_set_encrypt_key: pxor %xmm2,%xmm0 movdqu %xmm0,(%edx) decl %ecx - jz .L115done_key256 + jz .L119done_key256 pshufd $255,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -2406,11 +2463,11 @@ _aesni_set_encrypt_key: movdqu %xmm2,16(%edx) leal 32(%edx),%edx movdqa %xmm2,%xmm1 - jmp .L114loop_key256 -.L115done_key256: + jmp .L118loop_key256 +.L119done_key256: movl $13,%ecx movl %ecx,16(%edx) -.L102good_key: +.L116good_key: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2419,60 +2476,23 @@ _aesni_set_encrypt_key: pxor %xmm5,%xmm5 xorl %eax,%eax popl %ebx - popl %ebp - ret -.align 4 -.L093bad_pointer: - movl $-1,%eax - popl %ebx - popl %ebp ret .align 4 -.L097bad_keybits: +.L113bad_keybits: pxor %xmm0,%xmm0 movl $-2,%eax popl %ebx - popl %ebp ret -.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key -.globl aes_hw_set_encrypt_key -.hidden aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,@function -.align 16 -aes_hw_set_encrypt_key: -.L_aes_hw_set_encrypt_key_begin: -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call .L116pic -.L116pic: - popl %ebx - leal BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 4(%esp),%eax - movl 8(%esp),%ecx - movl 12(%esp),%edx - call _aesni_set_encrypt_key - ret -.size aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin -.globl aes_hw_set_decrypt_key -.hidden aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,@function -.align 16 -aes_hw_set_decrypt_key: -.L_aes_hw_set_decrypt_key_begin: - movl 4(%esp),%eax - movl 8(%esp),%ecx - movl 12(%esp),%edx - call _aesni_set_encrypt_key - movl 12(%esp),%edx +.size aes_hw_set_encrypt_key_alt,.-.L_aes_hw_set_encrypt_key_alt_begin +.globl aes_hw_encrypt_key_to_decrypt_key +.hidden aes_hw_encrypt_key_to_decrypt_key +.type aes_hw_encrypt_key_to_decrypt_key,@function +.align 16 +aes_hw_encrypt_key_to_decrypt_key: +.L_aes_hw_encrypt_key_to_decrypt_key_begin: + movl 4(%esp),%edx + movl 240(%edx),%ecx shll $4,%ecx - testl %eax,%eax - jnz .L117dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2480,7 +2500,7 @@ aes_hw_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L118dec_key_inverse: +.L120dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2490,16 +2510,14 @@ aes_hw_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L118dec_key_inverse + ja .L120dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 - xorl %eax,%eax -.L117dec_key_ret: ret -.size 
aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin +.size aes_hw_encrypt_key_to_decrypt_key,.-.L_aes_hw_encrypt_key_to_decrypt_key_begin .align 64 .Lkey_const: .long 202313229,202313229,202313229,202313229 @@ -2511,7 +2529,6 @@ aes_hw_set_decrypt_key: .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-apple.S similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-apple.S index dfe7d257..dc8e82d5 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -7,7 +6,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - .globl _aes_hw_encrypt .private_extern _aes_hw_encrypt @@ -1907,76 +1905,63 @@ L$cbc_ret: ret -.globl _aes_hw_set_decrypt_key -.private_extern _aes_hw_set_decrypt_key +.globl _aes_hw_encrypt_key_to_decrypt_key +.private_extern _aes_hw_encrypt_key_to_decrypt_key .p2align 4 -_aes_hw_set_decrypt_key: +_aes_hw_encrypt_key_to_decrypt_key: _CET_ENDBR -.byte 0x48,0x83,0xEC,0x08 - call __aesni_set_encrypt_key + movl 240(%rdi),%esi shll $4,%esi - testl %eax,%eax - jnz L$dec_key_ret - leaq 16(%rdx,%rsi,1),%rdi - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 - movups %xmm0,(%rdi) - movups %xmm1,(%rdx) - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi + leaq 16(%rdi,%rsi,1),%rdx + + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 + movups %xmm0,(%rdx) + movups %xmm1,(%rdi) + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx L$dec_key_inverse: - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 .byte 102,15,56,219,192 .byte 102,15,56,219,201 - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi - movups %xmm0,16(%rdi) - movups %xmm1,-16(%rdx) - cmpq %rdx,%rdi + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx + movups %xmm0,16(%rdx) + movups %xmm1,-16(%rdi) + cmpq %rdi,%rdx ja L$dec_key_inverse - movups (%rdx),%xmm0 + movups (%rdi),%xmm0 .byte 102,15,56,219,192 pxor %xmm1,%xmm1 - movups %xmm0,(%rdi) + movups %xmm0,(%rdx) pxor %xmm0,%xmm0 -L$dec_key_ret: - addq $8,%rsp - ret -L$SEH_end_set_decrypt_key: -.globl _aes_hw_set_encrypt_key -.private_extern _aes_hw_set_encrypt_key +.globl _aes_hw_set_encrypt_key_base +.private_extern _aes_hw_set_encrypt_key_base .p2align 4 -_aes_hw_set_encrypt_key: -__aesni_set_encrypt_key: +_aes_hw_set_encrypt_key_base: + _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,_BORINGSSL_function_hit+3(%rip) #endif -.byte 0x48,0x83,0xEC,0x08 + subq $8,%rsp + - movq $-1,%rax - testq %rdi,%rdi - jz L$enc_key_ret - testq %rdx,%rdx - jz L$enc_key_ret movups (%rdi),%xmm0 xorps %xmm4,%xmm4 - leaq _OPENSSL_ia32cap_P(%rip),%r10 - movl 4(%r10),%r10d - andl $268437504,%r10d leaq 16(%rdx),%rax cmpl $256,%esi je L$14rounds @@ -1987,8 +1972,6 @@ _CET_ENDBR L$10rounds: movl $9,%esi - cmpl $268435456,%r10d - je L$10rounds_alt movups %xmm0,(%rdx) .byte 
102,15,58,223,200,1 @@ -2017,7 +2000,193 @@ L$10rounds: jmp L$enc_key_ret .p2align 4 -L$10rounds_alt: +L$12rounds: + movq 16(%rdi),%xmm2 + movl $11,%esi + + movups %xmm0,(%rdx) +.byte 102,15,58,223,202,1 + call L$key_expansion_192a_cold +.byte 102,15,58,223,202,2 + call L$key_expansion_192b +.byte 102,15,58,223,202,4 + call L$key_expansion_192a +.byte 102,15,58,223,202,8 + call L$key_expansion_192b +.byte 102,15,58,223,202,16 + call L$key_expansion_192a +.byte 102,15,58,223,202,32 + call L$key_expansion_192b +.byte 102,15,58,223,202,64 + call L$key_expansion_192a +.byte 102,15,58,223,202,128 + call L$key_expansion_192b + movups %xmm0,(%rax) + movl %esi,48(%rax) + xorq %rax,%rax + jmp L$enc_key_ret + +.p2align 4 +L$14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call L$key_expansion_256a_cold +.byte 102,15,58,223,200,1 + call L$key_expansion_256b +.byte 102,15,58,223,202,2 + call L$key_expansion_256a +.byte 102,15,58,223,200,2 + call L$key_expansion_256b +.byte 102,15,58,223,202,4 + call L$key_expansion_256a +.byte 102,15,58,223,200,4 + call L$key_expansion_256b +.byte 102,15,58,223,202,8 + call L$key_expansion_256a +.byte 102,15,58,223,200,8 + call L$key_expansion_256b +.byte 102,15,58,223,202,16 + call L$key_expansion_256a +.byte 102,15,58,223,200,16 + call L$key_expansion_256b +.byte 102,15,58,223,202,32 + call L$key_expansion_256a +.byte 102,15,58,223,200,32 + call L$key_expansion_256b +.byte 102,15,58,223,202,64 + call L$key_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp L$enc_key_ret + +.p2align 4 +L$bad_keybits: + movq $-2,%rax +L$enc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + + ret + + + +.p2align 4 +L$key_expansion_128: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax +L$key_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_192a: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax +L$key_expansion_192a_cold: + movaps %xmm2,%xmm5 +L$key_expansion_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret + + +.p2align 4 +L$key_expansion_192b: + + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%rax) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%rax) + leaq 32(%rax),%rax + jmp L$key_expansion_192b_warm + + +.p2align 4 +L$key_expansion_256a: + + movups %xmm2,(%rax) + leaq 16(%rax),%rax +L$key_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_256b: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret + + + +.globl _aes_hw_set_encrypt_key_alt +.private_extern _aes_hw_set_encrypt_key_alt + +.p2align 4 +_aes_hw_set_encrypt_key_alt: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp + + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je L$14rounds_alt + 
cmpl $192,%esi + je L$12rounds_alt + cmpl $128,%esi + jne L$bad_keybits_alt + + movl $9,%esi movdqa L$key_rotate(%rip),%xmm5 movl $8,%r10d movdqa L$key_rcon1(%rip),%xmm4 @@ -2081,39 +2250,12 @@ L$oop_key128: movl %esi,96(%rax) xorl %eax,%eax - jmp L$enc_key_ret + jmp L$enc_key_ret_alt .p2align 4 -L$12rounds: +L$12rounds_alt: movq 16(%rdi),%xmm2 movl $11,%esi - cmpl $268435456,%r10d - je L$12rounds_alt - - movups %xmm0,(%rdx) -.byte 102,15,58,223,202,1 - call L$key_expansion_192a_cold -.byte 102,15,58,223,202,2 - call L$key_expansion_192b -.byte 102,15,58,223,202,4 - call L$key_expansion_192a -.byte 102,15,58,223,202,8 - call L$key_expansion_192b -.byte 102,15,58,223,202,16 - call L$key_expansion_192a -.byte 102,15,58,223,202,32 - call L$key_expansion_192b -.byte 102,15,58,223,202,64 - call L$key_expansion_192a -.byte 102,15,58,223,202,128 - call L$key_expansion_192b - movups %xmm0,(%rax) - movl %esi,48(%rax) - xorq %rax,%rax - jmp L$enc_key_ret - -.p2align 4 -L$12rounds_alt: movdqa L$key_rotate192(%rip),%xmm5 movdqa L$key_rcon1(%rip),%xmm4 movl $8,%r10d @@ -2151,51 +2293,13 @@ L$oop_key192: movl %esi,32(%rax) xorl %eax,%eax - jmp L$enc_key_ret + jmp L$enc_key_ret_alt .p2align 4 -L$14rounds: +L$14rounds_alt: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax - cmpl $268435456,%r10d - je L$14rounds_alt - - movups %xmm0,(%rdx) - movups %xmm2,16(%rdx) -.byte 102,15,58,223,202,1 - call L$key_expansion_256a_cold -.byte 102,15,58,223,200,1 - call L$key_expansion_256b -.byte 102,15,58,223,202,2 - call L$key_expansion_256a -.byte 102,15,58,223,200,2 - call L$key_expansion_256b -.byte 102,15,58,223,202,4 - call L$key_expansion_256a -.byte 102,15,58,223,200,4 - call L$key_expansion_256b -.byte 102,15,58,223,202,8 - call L$key_expansion_256a -.byte 102,15,58,223,200,8 - call L$key_expansion_256b -.byte 102,15,58,223,202,16 - call L$key_expansion_256a -.byte 102,15,58,223,200,16 - call L$key_expansion_256b -.byte 102,15,58,223,202,32 - call L$key_expansion_256a -.byte 102,15,58,223,200,32 - call L$key_expansion_256b -.byte 102,15,58,223,202,64 - call L$key_expansion_256a - movups %xmm0,(%rax) - movl %esi,16(%rax) - xorq %rax,%rax - jmp L$enc_key_ret - -.p2align 4 -L$14rounds_alt: movdqa L$key_rotate(%rip),%xmm5 movdqa L$key_rcon1(%rip),%xmm4 movl $7,%r10d @@ -2246,12 +2350,12 @@ L$oop_key256: L$done_key256: movl %esi,16(%rax) xorl %eax,%eax - jmp L$enc_key_ret + jmp L$enc_key_ret_alt .p2align 4 -L$bad_keybits: +L$bad_keybits_alt: movq $-2,%rax -L$enc_key_ret: +L$enc_key_ret_alt: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2262,76 +2366,6 @@ L$enc_key_ret: ret -L$SEH_end_set_encrypt_key: - -.p2align 4 -L$key_expansion_128: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -L$key_expansion_128_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret - -.p2align 4 -L$key_expansion_192a: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -L$key_expansion_192a_cold: - movaps %xmm2,%xmm5 -L$key_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - pslldq $4,%xmm3 - xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - ret - -.p2align 4 -L$key_expansion_192b: - movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 - movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 - movups %xmm3,16(%rax) - leaq 32(%rax),%rax - jmp L$key_expansion_192b_warm - -.p2align 4 -L$key_expansion_256a: - movups %xmm2,(%rax) - 
leaq 16(%rax),%rax -L$key_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret - -.p2align 4 -L$key_expansion_256b: - movups %xmm0,(%rax) - leaq 16(%rax),%rax - - shufps $16,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 - xorps %xmm1,%xmm2 - ret .section __DATA,__const @@ -2359,7 +2393,6 @@ L$key_rcon1b: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-linux.S similarity index 96% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-linux.S index e86b7b84..eb34d936 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesni-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesni-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -7,8 +6,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P .globl aes_hw_encrypt .hidden aes_hw_encrypt .type aes_hw_encrypt,@function @@ -1909,76 +1906,63 @@ _CET_ENDBR ret .cfi_endproc .size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt -.globl aes_hw_set_decrypt_key -.hidden aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,@function +.globl aes_hw_encrypt_key_to_decrypt_key +.hidden aes_hw_encrypt_key_to_decrypt_key +.type aes_hw_encrypt_key_to_decrypt_key,@function .align 16 -aes_hw_set_decrypt_key: +aes_hw_encrypt_key_to_decrypt_key: .cfi_startproc _CET_ENDBR -.byte 0x48,0x83,0xEC,0x08 -.cfi_adjust_cfa_offset 8 - call __aesni_set_encrypt_key + + movl 240(%rdi),%esi shll $4,%esi - testl %eax,%eax - jnz .Ldec_key_ret - leaq 16(%rdx,%rsi,1),%rdi - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 - movups %xmm0,(%rdi) - movups %xmm1,(%rdx) - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi + leaq 16(%rdi,%rsi,1),%rdx + + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 + movups %xmm0,(%rdx) + movups %xmm1,(%rdi) + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx .Ldec_key_inverse: - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 .byte 102,15,56,219,192 .byte 102,15,56,219,201 - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi - movups %xmm0,16(%rdi) - movups %xmm1,-16(%rdx) - cmpq %rdx,%rdi + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx + movups %xmm0,16(%rdx) + movups %xmm1,-16(%rdi) + cmpq %rdi,%rdx ja .Ldec_key_inverse - movups (%rdx),%xmm0 + movups (%rdi),%xmm0 .byte 102,15,56,219,192 pxor %xmm1,%xmm1 - movups %xmm0,(%rdi) + movups %xmm0,(%rdx) pxor %xmm0,%xmm0 -.Ldec_key_ret: - addq $8,%rsp -.cfi_adjust_cfa_offset -8 ret .cfi_endproc -.LSEH_end_set_decrypt_key: -.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key -.globl aes_hw_set_encrypt_key -.hidden aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,@function -.align 16 -aes_hw_set_encrypt_key: -__aesni_set_encrypt_key: +.size aes_hw_encrypt_key_to_decrypt_key,.-aes_hw_encrypt_key_to_decrypt_key +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function 
+.align 16 +aes_hw_set_encrypt_key_base: .cfi_startproc + _CET_ENDBR #ifdef BORINGSSL_DISPATCH_TEST movb $1,BORINGSSL_function_hit+3(%rip) #endif -.byte 0x48,0x83,0xEC,0x08 + subq $8,%rsp .cfi_adjust_cfa_offset 8 - movq $-1,%rax - testq %rdi,%rdi - jz .Lenc_key_ret - testq %rdx,%rdx - jz .Lenc_key_ret + movups (%rdi),%xmm0 xorps %xmm4,%xmm4 - leaq OPENSSL_ia32cap_P(%rip),%r10 - movl 4(%r10),%r10d - andl $268437504,%r10d leaq 16(%rdx),%rax cmpl $256,%esi je .L14rounds @@ -1989,8 +1973,6 @@ _CET_ENDBR .L10rounds: movl $9,%esi - cmpl $268435456,%r10d - je .L10rounds_alt movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 @@ -2019,7 +2001,193 @@ _CET_ENDBR jmp .Lenc_key_ret .align 16 -.L10rounds_alt: +.L12rounds: + movq 16(%rdi),%xmm2 + movl $11,%esi + + movups %xmm0,(%rdx) +.byte 102,15,58,223,202,1 + call .Lkey_expansion_192a_cold +.byte 102,15,58,223,202,2 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,4 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,8 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,16 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,32 + call .Lkey_expansion_192b +.byte 102,15,58,223,202,64 + call .Lkey_expansion_192a +.byte 102,15,58,223,202,128 + call .Lkey_expansion_192b + movups %xmm0,(%rax) + movl %esi,48(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.L14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call .Lkey_expansion_256a_cold +.byte 102,15,58,223,200,1 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,2 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,2 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,4 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,4 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,8 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,8 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,16 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,16 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,32 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,32 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,64 + call .Lkey_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + movq $-2,%rax +.Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc + + +.align 16 +.Lkey_expansion_128: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_192a: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_192a_cold: + movaps %xmm2,%xmm5 +.Lkey_expansion_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_192b: +.cfi_startproc + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%rax) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%rax) + leaq 32(%rax),%rax + jmp .Lkey_expansion_192b_warm +.cfi_endproc + +.align 16 +.Lkey_expansion_256a: +.cfi_startproc + movups %xmm2,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_256a_cold: + 
shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256b: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +.cfi_endproc +.size aes_hw_set_encrypt_key_base,.-aes_hw_set_encrypt_key_base + +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds_alt + cmpl $192,%esi + je .L12rounds_alt + cmpl $128,%esi + jne .Lbad_keybits_alt + + movl $9,%esi movdqa .Lkey_rotate(%rip),%xmm5 movl $8,%r10d movdqa .Lkey_rcon1(%rip),%xmm4 @@ -2083,39 +2251,12 @@ _CET_ENDBR movl %esi,96(%rax) xorl %eax,%eax - jmp .Lenc_key_ret + jmp .Lenc_key_ret_alt .align 16 -.L12rounds: +.L12rounds_alt: movq 16(%rdi),%xmm2 movl $11,%esi - cmpl $268435456,%r10d - je .L12rounds_alt - - movups %xmm0,(%rdx) -.byte 102,15,58,223,202,1 - call .Lkey_expansion_192a_cold -.byte 102,15,58,223,202,2 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,4 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,8 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,16 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,32 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,64 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,128 - call .Lkey_expansion_192b - movups %xmm0,(%rax) - movl %esi,48(%rax) - xorq %rax,%rax - jmp .Lenc_key_ret - -.align 16 -.L12rounds_alt: movdqa .Lkey_rotate192(%rip),%xmm5 movdqa .Lkey_rcon1(%rip),%xmm4 movl $8,%r10d @@ -2153,51 +2294,13 @@ _CET_ENDBR movl %esi,32(%rax) xorl %eax,%eax - jmp .Lenc_key_ret + jmp .Lenc_key_ret_alt .align 16 -.L14rounds: +.L14rounds_alt: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax - cmpl $268435456,%r10d - je .L14rounds_alt - - movups %xmm0,(%rdx) - movups %xmm2,16(%rdx) -.byte 102,15,58,223,202,1 - call .Lkey_expansion_256a_cold -.byte 102,15,58,223,200,1 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,2 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,2 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,4 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,4 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,8 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,8 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,16 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,16 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,32 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,32 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,64 - call .Lkey_expansion_256a - movups %xmm0,(%rax) - movl %esi,16(%rax) - xorq %rax,%rax - jmp .Lenc_key_ret - -.align 16 -.L14rounds_alt: movdqa .Lkey_rotate(%rip),%xmm5 movdqa .Lkey_rcon1(%rip),%xmm4 movl $7,%r10d @@ -2248,12 +2351,12 @@ _CET_ENDBR .Ldone_key256: movl %esi,16(%rax) xorl %eax,%eax - jmp .Lenc_key_ret + jmp .Lenc_key_ret_alt .align 16 -.Lbad_keybits: +.Lbad_keybits_alt: movq $-2,%rax -.Lenc_key_ret: +.Lenc_key_ret_alt: pxor %xmm0,%xmm0 pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 @@ -2264,78 +2367,8 @@ _CET_ENDBR .cfi_adjust_cfa_offset -8 ret .cfi_endproc -.LSEH_end_set_encrypt_key: - 
-.align 16 -.Lkey_expansion_128: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_128_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret - -.align 16 -.Lkey_expansion_192a: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_192a_cold: - movaps %xmm2,%xmm5 -.Lkey_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - pslldq $4,%xmm3 - xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - ret - -.align 16 -.Lkey_expansion_192b: - movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 - movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 - movups %xmm3,16(%rax) - leaq 32(%rax),%rax - jmp .Lkey_expansion_192b_warm -.align 16 -.Lkey_expansion_256a: - movups %xmm2,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret - -.align 16 -.Lkey_expansion_256b: - movups %xmm0,(%rax) - leaq 16(%rax),%rax - - shufps $16,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 - xorps %xmm1,%xmm2 - ret -.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key -.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key +.size aes_hw_set_encrypt_key_alt,.-aes_hw_set_encrypt_key_alt .section .rodata .align 64 .Lbswap_mask: @@ -2361,7 +2394,6 @@ _CET_ENDBR .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv7-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv7-linux.S index f38e85e4..7b3aa3c0 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv7-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv7-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -28,11 +27,6 @@ .align 5 aes_hw_set_encrypt_key: .Lenc_key: - mov r3,#-1 - cmp r0,#0 - beq .Lenc_key_abort - cmp r2,#0 - beq .Lenc_key_abort mov r3,#-2 cmp r1,#128 blt .Lenc_key_abort @@ -789,7 +783,6 @@ aes_hw_ctr32_encrypt_blocks: .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-apple.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-apple.S index 71e56359..8f9bf08f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -30,11 +29,6 @@ Lenc_key: AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 - mov x3,#-1 - cmp x0,#0 - b.eq Lenc_key_abort - cmp x2,#0 - b.eq Lenc_key_abort mov x3,#-2 cmp w1,#128 b.lt Lenc_key_abort @@ -791,7 +785,6 @@ Lctr32_done: #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-linux.S index 7f98e4bc..a79876a1 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -30,11 +29,6 @@ aes_hw_set_encrypt_key: AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 - mov x3,#-1 - cmp x0,#0 - b.eq .Lenc_key_abort - cmp x2,#0 - b.eq .Lenc_key_abort mov x3,#-2 cmp w1,#128 b.lt .Lenc_key_abort @@ -791,7 +785,6 @@ aes_hw_ctr32_encrypt_blocks: .size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-win.S new file mode 100644 index 00000000..eda354ee --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-armv8-win.S @@ -0,0 +1,803 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.section .rodata +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key + +.def aes_hw_set_encrypt_key + .type 32 +.endef +.align 5 +aes_hw_set_encrypt_key: +Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon + add x3,x3,:lo12:Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + b.eq L192 + b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +.align 4 +L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b Ldone + +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 
// return value + ldr x29,[sp],#16 + ret + + +.globl aes_hw_set_decrypt_key + +.def aes_hw_set_decrypt_key + .type 32 +.endef +.align 5 +aes_hw_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + bl Lenc_key + + cmp x0,#0 + b.ne Ldec_key_abort + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value +Ldec_key_abort: + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_hw_encrypt + +.def aes_hw_encrypt + .type 32 +.endef +.align 5 +aes_hw_encrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl aes_hw_decrypt + +.def aes_hw_decrypt + .type 32 +.endef +.align 5 +aes_hw_decrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl aes_hw_cbc_encrypt + +.def aes_hw_cbc_encrypt + .type 32 +.endef +.align 5 +aes_hw_cbc_encrypt: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? + ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
+ sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq Lcbc_enc128 + + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b Lenter_cbc_enc + +.align 4 +Loop_cbc_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop + +Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc + + st1 {v6.16b},[x1],#16 + b Lcbc_done + +.align 5 +Lcbc_enc128: + ld1 {v2.4s,v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b Lenter_cbc_enc128 +Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc128 + + st1 {v6.16b},[x1],#16 + b Lcbc_done +.align 5 +Lcbc_dec: + ld1 {v18.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v19.16b,v18.16b,v18.16b + b.lo Lcbc_dec_tail + + orr v1.16b,v18.16b,v18.16b + ld1 {v18.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + orr v19.16b,v18.16b,v18.16b + +Loop3x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v18.16b + // are loaded 
with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v18.16b,v18.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v18.16b},[x1],#16 + orr v18.16b,v19.16b,v19.16b + b.hs Loop3x_cbc_dec + + cmn x2,#0x30 + b.eq Lcbc_done + nop + +Lcbc_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Lcbc_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + b.eq Lcbc_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b Lcbc_done + +Lcbc_dec_one: + eor v5.16b,v5.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + +Lcbc_done: + st1 {v6.16b},[x4] +Lcbc_abort: + ldr x29,[sp],#16 + ret + +.globl aes_hw_ctr32_encrypt_blocks + +.def aes_hw_ctr32_encrypt_blocks + .type 32 +.endef +.align 5 +aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. + eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr 
x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-apple.S index e6a85f0c..5202753a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1555,7 +1554,6 @@ Ldec_blocks_less_than_1: // blocks left <= 1 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-linux.S index 8ff6cd7f..a53c6074 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/aesv8-gcm-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1555,7 +1554,6 @@ aes_gcm_dec_kernel: .size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-win.S new file mode 100644 index 00000000..cad80f35 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/aesv8-gcm-armv8-win.S @@ -0,0 +1,1564 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include +#if __ARM_MAX_ARCH__ >= 8 + +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel + +.def aes_gcm_enc_kernel + .type 32 +.endef +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr q25, [x8, #112] // load rk7 + add x5, x5, x0 + lsr x12, x11, #32 + fmov d2, x10 // CTR block 2 + orr w11, w11, w11 + rev w12, w12 // rev_ctr32 + fmov d1, x10 // CTR block 1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + add w12, w12, #1 // increment rev_ctr32 + rev w9, w12 // CTR block 1 + fmov d3, x10 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 1 + add w12, w12, #1 // CTR block 1 + ldr q19, [x8, #16] // load rk1 + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + ldr q20, [x8, #32] // load rk2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + orr x9, x11, x9, lsl #32 // CTR block 3 + fmov v3.d[1], x9 // CTR block 3 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q21, [x8, #48] // load rk3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q24, [x8, #96] // load rk6 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q23, [x8, #80] // load rk5 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q22, [x8, #64] // load rk4 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ldr q29, [x8, #176] // load rk11 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ldr q26, [x8, #128] // load rk8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + add w12, w12, #1 // CTR block 3 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + cmp x17, #12 // 
setup flags for AES-128/192/256 check + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + ldr q27, [x8, #144] // load rk9 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + ldr q28, [x8, #160] // load rk10 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + b.lt Lenc_finish_first_blocks // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v2.16b, v31.16b // AES block 2 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Lenc_tail // handle tail + + ldp x19, x20, [x0, #16] // AES block 1 - load plaintext + rev w9, w12 // CTR block 4 + ldp x6, x7, [x0, #0] // AES block 0 - load plaintext + ldp x23, x24, [x0, #48] // AES block 3 - load plaintext + ldp x21, x22, [x0, #32] // AES block 2 - load plaintext + add x0, x0, #64 // AES input_ptr update + eor x19, x19, x13 // AES block 1 - round N low + eor x20, x20, x14 // AES block 1 - round N high + fmov d5, x19 // 
AES block 1 - mov low + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + eor x24, x24, x14 // AES block 3 - round N high + fmov d4, x6 // AES block 0 - mov low + cmp x0, x5 // check if we have <= 8 blocks + fmov v4.d[1], x7 // AES block 0 - mov high + eor x23, x23, x13 // AES block 3 - round N low + eor x21, x21, x13 // AES block 2 - round N low + fmov v5.d[1], x20 // AES block 1 - mov high + fmov d6, x21 // AES block 2 - mov low + add w12, w12, #1 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov d7, x23 // AES block 3 - mov low + eor x22, x22, x14 // AES block 2 - round N high + fmov v6.d[1], x22 // AES block 2 - mov high + eor v4.16b, v4.16b, v0.16b // AES block 0 - result + fmov d0, x10 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + eor v5.16b, v5.16b, v1.16b // AES block 1 - result + fmov d1, x10 // CTR block 5 + orr x9, x11, x9, lsl #32 // CTR block 5 + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + st1 { v4.16b}, [x2], #16 // AES block 0 - store result + fmov v7.d[1], x24 // AES block 3 - mov high + orr x9, x11, x9, lsl #32 // CTR block 6 + eor v6.16b, v6.16b, v2.16b // AES block 2 - result + st1 { v5.16b}, [x2], #16 // AES block 1 - store result + add w12, w12, #1 // CTR block 6 + fmov d2, x10 // CTR block 6 + fmov v2.d[1], x9 // CTR block 6 + st1 { v6.16b}, [x2], #16 // AES block 2 - store result + rev w9, w12 // CTR block 7 + orr x9, x11, x9, lsl #32 // CTR block 7 + eor v7.16b, v7.16b, v3.16b // AES block 3 - result + st1 { v7.16b}, [x2], #16 // AES block 3 - store result + b.ge Lenc_prepretail // do prepretail + +Lenc_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + fmov v3.d[1], x9 // CTR block 4k+3 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x23, x23, x13 // AES block 4k+7 - round N low + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + eor x22, x22, x14 // AES block 4k+6 - round N high + mov d8, v4.d[1] // GHASH block 4k - mid + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b // GHASH block 4k+3 
(t0, t1, t2 and t3 free) + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor x19, x19, x13 // AES block 4k+5 - round N low + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + eor x21, x21, x13 // AES block 4k+6 - round N low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + movi v8.8b, #0xc2 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + fmov d5, x19 // AES block 4k+5 - mov low + ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext + b.lt Lenc_main_loop_continue // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + 
aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Lenc_main_loop_continue: + shl d8, d8, #56 // mod_constant + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + add w12, w12, #1 // CTR block 4k+3 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + add x0, x0, #64 // AES input_ptr update + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + rev w9, w12 // CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor x6, x6, x13 // AES block 4k+4 - round N low + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + eor x7, x7, x14 // AES block 4k+4 - round N high + fmov d4, x6 // AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid + eor x20, x20, x14 // AES block 4k+5 - round N high + eor x24, x24, x14 // AES block 4k+7 - round N high + add w12, w12, #1 // CTR block 4k+8 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + fmov d7, x23 // AES block 4k+7 - mov low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + fmov v5.d[1], x20 // AES block 4k+5 - mov high + fmov d6, x21 // AES block 4k+6 - mov low + cmp x0, x5 // LOOP CONTROL + fmov v6.d[1], x22 // AES block 4k+6 - mov high + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + rev w9, w12 // CTR block 4k+9 + add w12, w12, #1 // CTR block 4k+9 + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + fmov d1, x10 // CTR block 4k+9 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + fmov v1.d[1], x9 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + rev w9, w12 // CTR block 4k+10 + st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + fmov v7.d[1], x24 // AES block 4k+7 - mov high + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + fmov d2, x10 // CTR block 4k+10 + st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result + fmov 
v2.d[1], x9 // CTR block 4k+10 + rev w9, w12 // CTR block 4k+11 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + orr x9, x11, x9, lsl #32 // CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result + b.lt Lenc_main_loop + +Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + fmov v3.d[1], x9 // CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + add w12, w12, #1 // CTR block 4k+3 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 
4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + b.eq Lenc_finish_prepretail // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + +Lenc_finish_prepretail: + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v11.16b, v11.16b, v4.16b + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b + +Lenc_tail: // TAIL + ext 
v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 4k+4 - mov low + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result + b.gt Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + sub w12, w12, #1 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + b.gt Lenc_blocks_more_than_1 + sub w12, w12, #1 + b Lenc_blocks_less_than_1 +Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-3 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ldp x6, x7, [x0], #16 // AES final block - load input low & high + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final 
block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored + mvn x14, xzr // rkN_h = 0xffffffffffffffff + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0b is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + rev w9, w12 + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + str w9, [x16, #12] // store the updated counter + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_gcm_dec_kernel + +.def aes_gcm_dec_kernel + .type 32 +.endef +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 
2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low 
+ rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc 
v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, 
v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 
4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, 
v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese 
v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins 
v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/armv4-mont-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/armv4-mont-linux.S index bbec47b3..d321b10f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv4-mont-linux.linux.arm.S +++ 
b/Sources/CCryptoBoringSSL/gen/bcm/armv4-mont-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -939,7 +938,6 @@ bn_mul8x_mont_neon: .byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-apple.S index 82b43294..594fdbde 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1425,7 +1424,6 @@ Lmul4x_done: .align 2 .align 4 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-linux.S index 5d56f5b2..03137764 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/armv8-mont-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1425,7 +1424,6 @@ __bn_mul4x_mont: .align 2 .align 4 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-win.S b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-win.S new file mode 100644 index 00000000..f12bf106 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/armv8-mont-win.S @@ -0,0 +1,1436 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.text + +.globl bn_mul_mont + +.def bn_mul_mont + .type 32 +.endef +.align 5 +bn_mul_mont: + AARCH64_SIGN_LINK_REGISTER + tst x5,#7 + b.eq __bn_sqr8x_mont + tst x5,#3 + b.eq __bn_mul4x_mont +Lmul_mont: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def __bn_sqr8x_mont + .type 32 +.endef +.align 5 +__bn_sqr8x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont which has already signed the return address. + cmp x1,x2 + b.ne __bn_mul4x_mont +Lsqr8x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp 
x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ 
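
The per-word schedules above implement the usual squaring shortcut: this pass accumulates only the cross products a[i]*a[j] with i < j; the doubling and the diagonal terms a[i]^2 are folded in later at Lsqr8x_outer_break. As a side note (not from the diff), for limbs a_0..a_{n-1} in base 2^64:

  \Bigl(\sum_{i=0}^{n-1} a_i 2^{64i}\Bigr)^2 = \sum_{i=0}^{n-1} a_i^2\,2^{128i} + 2\sum_{0 \le i < j < n} a_i a_j\,2^{64(i+j)}

so roughly half of the single-word multiplications of a general product can be skipped when squaring.
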
+Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? + ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + 
ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? + adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? 
+ sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? 
+ ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def __bn_mul4x_mont + .type 32 +.endef +.align 5 +__bn_mul4x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont or __bn_mul8x_mont which have already signed the + // return address. + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/bcm/bn-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-586-apple.S new file mode 100644 index 00000000..41d93acb --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-586-apple.S @@ -0,0 +1,536 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_add_words +.private_extern _bn_mul_add_words +.align 4 +_bn_mul_add_words: +L_bn_mul_add_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 + jmp L000maw_sse2_entry +.align 4,0x90 +L001maw_sse2_unrolled: + movd (%eax),%mm3 + paddq %mm3,%mm1 + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + movd 4(%edx),%mm4 + pmuludq %mm0,%mm4 + movd 8(%edx),%mm6 + pmuludq %mm0,%mm6 + movd 12(%edx),%mm7 + pmuludq %mm0,%mm7 + paddq %mm2,%mm1 + movd 4(%eax),%mm3 + paddq %mm4,%mm3 + movd 8(%eax),%mm5 + paddq %mm6,%mm5 + movd 12(%eax),%mm4 + paddq %mm4,%mm7 + movd %mm1,(%eax) + movd 16(%edx),%mm2 + pmuludq %mm0,%mm2 + psrlq $32,%mm1 + movd 20(%edx),%mm4 + pmuludq %mm0,%mm4 + paddq %mm3,%mm1 + movd 24(%edx),%mm6 + pmuludq %mm0,%mm6 + movd %mm1,4(%eax) + psrlq $32,%mm1 + movd 28(%edx),%mm3 + addl $32,%edx + pmuludq %mm0,%mm3 + paddq %mm5,%mm1 + movd 16(%eax),%mm5 + paddq %mm5,%mm2 + movd %mm1,8(%eax) + psrlq $32,%mm1 + paddq %mm7,%mm1 + movd 20(%eax),%mm5 + paddq %mm5,%mm4 + movd %mm1,12(%eax) + psrlq $32,%mm1 + paddq %mm2,%mm1 + movd 24(%eax),%mm5 + paddq %mm5,%mm6 + movd %mm1,16(%eax) + psrlq $32,%mm1 + paddq %mm4,%mm1 + movd 28(%eax),%mm5 + paddq %mm5,%mm3 + movd %mm1,20(%eax) + psrlq $32,%mm1 + paddq %mm6,%mm1 + movd %mm1,24(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,28(%eax) + leal 32(%eax),%eax + psrlq $32,%mm1 + subl $8,%ecx + jz L002maw_sse2_exit +L000maw_sse2_entry: + testl $4294967288,%ecx + jnz L001maw_sse2_unrolled +.align 2,0x90 +L003maw_sse2_loop: + movd (%edx),%mm2 + movd (%eax),%mm3 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm3,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz L003maw_sse2_loop +L002maw_sse2_exit: + movd %mm1,%eax + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_mul_words +.private_extern _bn_mul_words +.align 4 +_bn_mul_words: +L_bn_mul_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 +.align 4,0x90 +L004mw_sse2_loop: + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz L004mw_sse2_loop + movd %mm1,%eax + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_sqr_words +.private_extern _bn_sqr_words +.align 4 +_bn_sqr_words: +L_bn_sqr_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx +.align 4,0x90 +L005sqr_sse2_loop: + movd (%edx),%mm0 + pmuludq %mm0,%mm0 + leal 4(%edx),%edx + movq %mm0,(%eax) + subl $1,%ecx + leal 8(%eax),%eax + jnz L005sqr_sse2_loop + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_div_words +.private_extern _bn_div_words +.align 4 +_bn_div_words: +L_bn_div_words_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + movl 12(%esp),%ecx + divl %ecx + ret +.globl _bn_add_words +.private_extern _bn_add_words +.align 4 +_bn_add_words: +L_bn_add_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp),%ebx + movl 24(%esp),%esi + movl 28(%esp),%edi + movl 32(%esp),%ebp + xorl %eax,%eax + andl $4294967288,%ebp + jz L006aw_finish +L007aw_loop: + # Round 0 + movl (%esi),%ecx + movl (%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,(%ebx) + # Round 1 + movl 4(%esi),%ecx + movl 
4(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,4(%ebx) + # Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,8(%ebx) + # Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,12(%ebx) + # Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,16(%ebx) + # Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,20(%ebx) + # Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) + # Round 7 + movl 28(%esi),%ecx + movl 28(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,28(%ebx) + + addl $32,%esi + addl $32,%edi + addl $32,%ebx + subl $8,%ebp + jnz L007aw_loop +L006aw_finish: + movl 32(%esp),%ebp + andl $7,%ebp + jz L008aw_end + # Tail Round 0 + movl (%esi),%ecx + movl (%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,(%ebx) + jz L008aw_end + # Tail Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,4(%ebx) + jz L008aw_end + # Tail Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,8(%ebx) + jz L008aw_end + # Tail Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,12(%ebx) + jz L008aw_end + # Tail Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,16(%ebx) + jz L008aw_end + # Tail Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,20(%ebx) + jz L008aw_end + # Tail Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) +L008aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_sub_words +.private_extern _bn_sub_words +.align 4 +_bn_sub_words: +L_bn_sub_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp),%ebx + movl 24(%esp),%esi + movl 28(%esp),%edi + movl 32(%esp),%ebp + xorl %eax,%eax + andl $4294967288,%ebp + jz L009aw_finish +L010aw_loop: + # Round 0 + movl (%esi),%ecx + movl (%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,(%ebx) + # Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,4(%ebx) + # Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,8(%ebx) + # Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,12(%ebx) + # Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + subl %eax,%ecx + movl 
$0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,16(%ebx) + # Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,20(%ebx) + # Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) + # Round 7 + movl 28(%esi),%ecx + movl 28(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,28(%ebx) + + addl $32,%esi + addl $32,%edi + addl $32,%ebx + subl $8,%ebp + jnz L010aw_loop +L009aw_finish: + movl 32(%esp),%ebp + andl $7,%ebp + jz L011aw_end + # Tail Round 0 + movl (%esi),%ecx + movl (%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,(%ebx) + jz L011aw_end + # Tail Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,4(%ebx) + jz L011aw_end + # Tail Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,8(%ebx) + jz L011aw_end + # Tail Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,12(%ebx) + jz L011aw_end + # Tail Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,16(%ebx) + jz L011aw_end + # Tail Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,20(%ebx) + jz L011aw_end + # Tail Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) +L011aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-586-linux.S similarity index 53% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/bn-586-linux.S index ff3c9b61..43edba09 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -13,20 +12,14 @@ .align 16 bn_mul_add_words: .L_bn_mul_add_words_begin: - call .L000PIC_me_up -.L000PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L001maw_non_sse2 movl 4(%esp),%eax movl 8(%esp),%edx movl 12(%esp),%ecx movd 16(%esp),%mm0 pxor %mm1,%mm1 - jmp .L002maw_sse2_entry + jmp .L000maw_sse2_entry .align 16 -.L003maw_sse2_unrolled: +.L001maw_sse2_unrolled: movd (%eax),%mm3 paddq %mm3,%mm1 movd (%edx),%mm2 @@ -86,12 +79,12 @@ bn_mul_add_words: leal 32(%eax),%eax psrlq $32,%mm1 subl $8,%ecx - jz .L004maw_sse2_exit -.L002maw_sse2_entry: + jz .L002maw_sse2_exit +.L000maw_sse2_entry: testl $4294967288,%ecx - jnz .L003maw_sse2_unrolled + jnz .L001maw_sse2_unrolled .align 4 -.L005maw_sse2_loop: +.L003maw_sse2_loop: movd (%edx),%mm2 movd (%eax),%mm3 pmuludq %mm0,%mm2 @@ -102,189 +95,11 @@ bn_mul_add_words: subl $1,%ecx psrlq $32,%mm1 leal 4(%eax),%eax - jnz .L005maw_sse2_loop -.L004maw_sse2_exit: + jnz .L003maw_sse2_loop +.L002maw_sse2_exit: movd %mm1,%eax emms ret -.align 16 -.L001maw_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - xorl %esi,%esi - movl 20(%esp),%edi - movl 28(%esp),%ecx - movl 24(%esp),%ebx - andl $4294967288,%ecx - movl 32(%esp),%ebp - pushl %ecx - jz .L006maw_finish -.align 16 -.L007maw_loop: - - movl (%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl (%edi),%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - - movl 4(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 4(%edi),%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - - movl 8(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 8(%edi),%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - - movl 12(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 12(%edi),%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - - movl 16(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 16(%edi),%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - - movl 20(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 20(%edi),%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - - movl 24(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 24(%edi),%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi - - movl 28(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 28(%edi),%eax - adcl $0,%edx - movl %eax,28(%edi) - movl %edx,%esi - - subl $8,%ecx - leal 32(%ebx),%ebx - leal 32(%edi),%edi - jnz .L007maw_loop -.L006maw_finish: - movl 32(%esp),%ecx - andl $7,%ecx - jnz .L008maw_finish2 - jmp .L009maw_end -.L008maw_finish2: - - movl (%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl (%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 4(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 4(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,4(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 8(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 8(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,8(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 12(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 12(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,12(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 16(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 16(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,16(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 20(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 20(%edi),%eax - adcl 
$0,%edx - decl %ecx - movl %eax,20(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 24(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 24(%edi),%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi -.L009maw_end: - movl %esi,%eax - popl %ecx popl %edi popl %esi popl %ebx @@ -297,19 +112,13 @@ bn_mul_add_words: .align 16 bn_mul_words: .L_bn_mul_words_begin: - call .L010PIC_me_up -.L010PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L011mw_non_sse2 movl 4(%esp),%eax movl 8(%esp),%edx movl 12(%esp),%ecx movd 16(%esp),%mm0 pxor %mm1,%mm1 .align 16 -.L012mw_sse2_loop: +.L004mw_sse2_loop: movd (%edx),%mm2 pmuludq %mm0,%mm2 leal 4(%edx),%edx @@ -318,156 +127,10 @@ bn_mul_words: subl $1,%ecx psrlq $32,%mm1 leal 4(%eax),%eax - jnz .L012mw_sse2_loop + jnz .L004mw_sse2_loop movd %mm1,%eax emms ret -.align 16 -.L011mw_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - xorl %esi,%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx - movl 28(%esp),%ebp - movl 32(%esp),%ecx - andl $4294967288,%ebp - jz .L013mw_finish -.L014mw_loop: - - movl (%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - - movl 4(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - - movl 8(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - - movl 12(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - - movl 16(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - - movl 20(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - - movl 24(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi - - movl 28(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,28(%edi) - movl %edx,%esi - - addl $32,%ebx - addl $32,%edi - subl $8,%ebp - jz .L013mw_finish - jmp .L014mw_loop -.L013mw_finish: - movl 28(%esp),%ebp - andl $7,%ebp - jnz .L015mw_finish2 - jmp .L016mw_end -.L015mw_finish2: - - movl (%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 4(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 8(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 12(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 16(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 20(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 24(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi -.L016mw_end: - movl %esi,%eax popl %edi popl %esi popl %ebx @@ -480,136 +143,20 @@ bn_mul_words: .align 16 bn_sqr_words: .L_bn_sqr_words_begin: - call .L017PIC_me_up -.L017PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L018sqr_non_sse2 movl 4(%esp),%eax movl 8(%esp),%edx movl 12(%esp),%ecx .align 16 -.L019sqr_sse2_loop: +.L005sqr_sse2_loop: movd (%edx),%mm0 pmuludq %mm0,%mm0 leal 4(%edx),%edx movq %mm0,(%eax) subl $1,%ecx leal 8(%eax),%eax - jnz 
.L019sqr_sse2_loop + jnz .L005sqr_sse2_loop emms ret -.align 16 -.L018sqr_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%ebx - andl $4294967288,%ebx - jz .L020sw_finish -.L021sw_loop: - - movl (%edi),%eax - mull %eax - movl %eax,(%esi) - movl %edx,4(%esi) - - movl 4(%edi),%eax - mull %eax - movl %eax,8(%esi) - movl %edx,12(%esi) - - movl 8(%edi),%eax - mull %eax - movl %eax,16(%esi) - movl %edx,20(%esi) - - movl 12(%edi),%eax - mull %eax - movl %eax,24(%esi) - movl %edx,28(%esi) - - movl 16(%edi),%eax - mull %eax - movl %eax,32(%esi) - movl %edx,36(%esi) - - movl 20(%edi),%eax - mull %eax - movl %eax,40(%esi) - movl %edx,44(%esi) - - movl 24(%edi),%eax - mull %eax - movl %eax,48(%esi) - movl %edx,52(%esi) - - movl 28(%edi),%eax - mull %eax - movl %eax,56(%esi) - movl %edx,60(%esi) - - addl $32,%edi - addl $64,%esi - subl $8,%ebx - jnz .L021sw_loop -.L020sw_finish: - movl 28(%esp),%ebx - andl $7,%ebx - jz .L022sw_end - - movl (%edi),%eax - mull %eax - movl %eax,(%esi) - decl %ebx - movl %edx,4(%esi) - jz .L022sw_end - - movl 4(%edi),%eax - mull %eax - movl %eax,8(%esi) - decl %ebx - movl %edx,12(%esi) - jz .L022sw_end - - movl 8(%edi),%eax - mull %eax - movl %eax,16(%esi) - decl %ebx - movl %edx,20(%esi) - jz .L022sw_end - - movl 12(%edi),%eax - mull %eax - movl %eax,24(%esi) - decl %ebx - movl %edx,28(%esi) - jz .L022sw_end - - movl 16(%edi),%eax - mull %eax - movl %eax,32(%esi) - decl %ebx - movl %edx,36(%esi) - jz .L022sw_end - - movl 20(%edi),%eax - mull %eax - movl %eax,40(%esi) - decl %ebx - movl %edx,44(%esi) - jz .L022sw_end - - movl 24(%edi),%eax - mull %eax - movl %eax,48(%esi) - movl %edx,52(%esi) -.L022sw_end: popl %edi popl %esi popl %ebx @@ -645,8 +192,8 @@ bn_add_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L023aw_finish -.L024aw_loop: + jz .L006aw_finish +.L007aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -724,11 +271,11 @@ bn_add_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L024aw_loop -.L023aw_finish: + jnz .L007aw_loop +.L006aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L025aw_end + jz .L008aw_end movl (%esi),%ecx movl (%edi),%edx @@ -739,7 +286,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L025aw_end + jz .L008aw_end movl 4(%esi),%ecx movl 4(%edi),%edx @@ -750,7 +297,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L025aw_end + jz .L008aw_end movl 8(%esi),%ecx movl 8(%edi),%edx @@ -761,7 +308,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L025aw_end + jz .L008aw_end movl 12(%esi),%ecx movl 12(%edi),%edx @@ -772,7 +319,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L025aw_end + jz .L008aw_end movl 16(%esi),%ecx movl 16(%edi),%edx @@ -783,7 +330,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L025aw_end + jz .L008aw_end movl 20(%esi),%ecx movl 20(%edi),%edx @@ -794,7 +341,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L025aw_end + jz .L008aw_end movl 24(%esi),%ecx movl 24(%edi),%edx @@ -804,7 +351,7 @@ bn_add_words: addl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -.L025aw_end: +.L008aw_end: popl %edi popl %esi popl %ebx @@ -828,8 +375,8 @@ bn_sub_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L026aw_finish -.L027aw_loop: + jz .L009aw_finish +.L010aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -907,11 +454,11 @@ bn_sub_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L027aw_loop -.L026aw_finish: + jnz .L010aw_loop 
+.L009aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L028aw_end + jz .L011aw_end movl (%esi),%ecx movl (%edi),%edx @@ -922,7 +469,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L028aw_end + jz .L011aw_end movl 4(%esi),%ecx movl 4(%edi),%edx @@ -933,7 +480,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L028aw_end + jz .L011aw_end movl 8(%esi),%ecx movl 8(%edi),%edx @@ -944,7 +491,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L028aw_end + jz .L011aw_end movl 12(%esi),%ecx movl 12(%edi),%edx @@ -955,7 +502,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L028aw_end + jz .L011aw_end movl 16(%esi),%ecx movl 16(%edi),%edx @@ -966,7 +513,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L028aw_end + jz .L011aw_end movl 20(%esi),%ecx movl 20(%edi),%edx @@ -977,7 +524,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L028aw_end + jz .L011aw_end movl 24(%esi),%ecx movl 24(%edi),%edx @@ -987,7 +534,7 @@ bn_sub_words: subl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -.L028aw_end: +.L011aw_end: popl %edi popl %esi popl %ebx @@ -995,7 +542,6 @@ bn_sub_words: ret .size bn_sub_words,.-.L_bn_sub_words_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-apple.S similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-apple.S index 4efdfa89..66e8c837 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -89,7 +88,6 @@ Lsub_exit: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-linux.S similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-linux.S index 14d9eeee..9ea9ad7d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bn-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -89,7 +88,6 @@ bn_sub_words: ret .size bn_sub_words,.-bn_sub_words #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-win.S new file mode 100644 index 00000000..6c7a6a94 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/bn-armv8-win.S @@ -0,0 +1,94 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.text + +// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); + +.globl bn_add_words + +.align 4 +bn_add_words: + AARCH64_VALID_CALL_TARGET + # Clear the carry flag. + cmn xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. + lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, Ladd_tail +Ladd_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + adcs x4, x4, x6 + adcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, Ladd_loop + +Ladd_tail: + cbz x3, Ladd_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + adcs x4, x4, x6 + str x4, [x0], #8 + +Ladd_exit: + cset x0, cs + ret + + +// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); + +.globl bn_sub_words + +.align 4 +bn_sub_words: + AARCH64_VALID_CALL_TARGET + # Set the carry flag. Arm's borrow bit is flipped from the carry flag, + # so we want C = 1 here. + cmp xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. + lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, Lsub_tail +Lsub_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + sbcs x4, x4, x6 + sbcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, Lsub_loop + +Lsub_tail: + cbz x3, Lsub_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + sbcs x4, x4, x6 + str x4, [x0], #8 + +Lsub_exit: + cset x0, cc + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/bsaes-armv7-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/bsaes-armv7-linux.S index 5f512cc3..5d85cdfb 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/bsaes-armv7-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/bsaes-armv7-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
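
The bn_add_words and bn_sub_words routines added above in gen/bcm/bn-armv8-win.S keep a single carry (or borrow) flag live across the whole limb array: the limb count is split into pairs so LDP/STP can be used, and the loop ends with CBNZ so the flags survive from one iteration to the next. A portable C sketch of the same word-array contract (illustration only; the function names, the BN_ULONG typedef, and 64-bit little-endian limb order are assumptions here, not code from the diff):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t BN_ULONG;  /* assumed 64-bit limbs, least-significant limb first */

/* rp[i] = ap[i] + bp[i] + carry-in; returns the final carry (0 or 1). */
BN_ULONG add_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                          const BN_ULONG *bp, size_t num) {
    BN_ULONG carry = 0;
    for (size_t i = 0; i < num; i++) {
        BN_ULONG t = ap[i] + carry;
        BN_ULONG c1 = (t < carry);     /* wrapped while adding the incoming carry */
        rp[i] = t + bp[i];
        carry = c1 + (rp[i] < t);      /* wrapped while adding bp[i] */
    }
    return carry;
}

/* rp[i] = ap[i] - bp[i] - borrow-in; returns the final borrow (0 or 1). */
BN_ULONG sub_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                          const BN_ULONG *bp, size_t num) {
    BN_ULONG borrow = 0;
    for (size_t i = 0; i < num; i++) {
        BN_ULONG t = ap[i] - borrow;
        BN_ULONG b1 = (t > ap[i]);     /* wrapped while subtracting the incoming borrow */
        rp[i] = t - bp[i];
        borrow = b1 + (rp[i] > t);     /* wrapped while subtracting bp[i] */
    }
    return borrow;
}

int main(void) {
    BN_ULONG a[2] = {UINT64_MAX, UINT64_MAX}, b[2] = {1, 0}, r[2];
    /* (2^128 - 1) + 1 = 2^128: both result limbs are 0 and the carry out is 1. */
    BN_ULONG carry = add_words_sketch(r, a, b, 2);
    printf("r = {%llu, %llu}, carry = %llu\n",
           (unsigned long long)r[0], (unsigned long long)r[1],
           (unsigned long long)carry);
    return 0;
}
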
@@ -1517,7 +1516,6 @@ bsaes_ctr32_encrypt_blocks: .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/co-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/co-586-apple.S new file mode 100644 index 00000000..935cc5ed --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/co-586-apple.S @@ -0,0 +1,1261 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_comba8 +.private_extern _bn_mul_comba8 +.align 4 +_bn_mul_comba8: +L_bn_mul_comba8_begin: + pushl %esi + movl 12(%esp),%esi + pushl %edi + movl 20(%esp),%edi + pushl %ebp + pushl %ebx + xorl %ebx,%ebx + movl (%esi),%eax + xorl %ecx,%ecx + movl (%edi),%edx + # ################## Calculate word 0 + xorl %ebp,%ebp + # mul a[0]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,(%eax) + movl 4(%esi),%eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx,%ebx + # mul a[1]*b[0] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,4(%eax) + movl 8(%esi),%eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx,%ecx + # mul a[2]*b[0] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[1] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,8(%eax) + movl 12(%esi),%eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp,%ebp + # mul a[3]*b[0] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[1] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[2] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,12(%eax) + movl 16(%esi),%eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx,%ebx + # mul a[4]*b[0] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[1] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[2] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[3] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[4] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,16(%eax) + movl 20(%esi),%eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx,%ecx + # mul a[5]*b[0] + mull %edx + addl %eax,%ebp + 
movl 16(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[4]*b[1] + mull %edx + addl %eax,%ebp + movl 12(%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[3]*b[2] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[3] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 16(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[4] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[5] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,20(%eax) + movl 24(%esi),%eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp,%ebp + # mul a[6]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[1] + mull %edx + addl %eax,%ebx + movl 16(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[4]*b[2] + mull %edx + addl %eax,%ebx + movl 12(%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[3]*b[3] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 16(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[4] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 20(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[5] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[6] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,24(%eax) + movl 28(%esi),%eax + # saved r[6] + # ################## Calculate word 7 + xorl %ebx,%ebx + # mul a[7]*b[0] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[5]*b[2] + mull %edx + addl %eax,%ecx + movl 16(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[4]*b[3] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[4] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 20(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[5] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 24(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[6] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + movl %ecx,28(%eax) + movl 28(%esi),%eax + # saved r[7] + # ################## Calculate word 8 + xorl %ecx,%ecx + # mul a[7]*b[1] + mull %edx + addl %eax,%ebp + movl 24(%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[6]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[5]*b[3] + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 16(%edi),%edx + adcl $0,%ecx + # mul a[4]*b[4] + mull %edx + addl %eax,%ebp + movl 12(%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[3]*b[5] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 24(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[6] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 28(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[7] + 
mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + movl %ebp,32(%eax) + movl 28(%esi),%eax + # saved r[8] + # ################## Calculate word 9 + xorl %ebp,%ebp + # mul a[7]*b[2] + mull %edx + addl %eax,%ebx + movl 24(%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[6]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 16(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[4] + mull %edx + addl %eax,%ebx + movl 16(%esi),%eax + adcl %edx,%ecx + movl 20(%edi),%edx + adcl $0,%ebp + # mul a[4]*b[5] + mull %edx + addl %eax,%ebx + movl 12(%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[3]*b[6] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 28(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[7] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + movl %ebx,36(%eax) + movl 28(%esi),%eax + # saved r[9] + # ################## Calculate word 10 + xorl %ebx,%ebx + # mul a[7]*b[3] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[4] + mull %edx + addl %eax,%ecx + movl 20(%esi),%eax + adcl %edx,%ebp + movl 20(%edi),%edx + adcl $0,%ebx + # mul a[5]*b[5] + mull %edx + addl %eax,%ecx + movl 16(%esi),%eax + adcl %edx,%ebp + movl 24(%edi),%edx + adcl $0,%ebx + # mul a[4]*b[6] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + movl %ecx,40(%eax) + movl 28(%esi),%eax + # saved r[10] + # ################## Calculate word 11 + xorl %ecx,%ecx + # mul a[7]*b[4] + mull %edx + addl %eax,%ebp + movl 24(%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[6]*b[5] + mull %edx + addl %eax,%ebp + movl 20(%esi),%eax + adcl %edx,%ebx + movl 24(%edi),%edx + adcl $0,%ecx + # mul a[5]*b[6] + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 28(%edi),%edx + adcl $0,%ecx + # mul a[4]*b[7] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + movl %ebp,44(%eax) + movl 28(%esi),%eax + # saved r[11] + # ################## Calculate word 12 + xorl %ebp,%ebp + # mul a[7]*b[5] + mull %edx + addl %eax,%ebx + movl 24(%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[6]*b[6] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 28(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[7] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + movl %ebx,48(%eax) + movl 28(%esi),%eax + # saved r[12] + # ################## Calculate word 13 + xorl %ebx,%ebx + # mul a[7]*b[6] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + movl %ecx,52(%eax) + movl 28(%esi),%eax + # saved r[13] + # ################## Calculate word 14 + xorl %ecx,%ecx + # mul a[7]*b[7] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + adcl $0,%ecx + movl %ebp,56(%eax) + # saved r[14] + # save r[15] + movl %ebx,60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_mul_comba4 +.private_extern _bn_mul_comba4 +.align 4 +_bn_mul_comba4: +L_bn_mul_comba4_begin: + pushl 
%esi + movl 12(%esp),%esi + pushl %edi + movl 20(%esp),%edi + pushl %ebp + pushl %ebx + xorl %ebx,%ebx + movl (%esi),%eax + xorl %ecx,%ecx + movl (%edi),%edx + # ################## Calculate word 0 + xorl %ebp,%ebp + # mul a[0]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,(%eax) + movl 4(%esi),%eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx,%ebx + # mul a[1]*b[0] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,4(%eax) + movl 8(%esi),%eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx,%ecx + # mul a[2]*b[0] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[1] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,8(%eax) + movl 12(%esi),%eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp,%ebp + # mul a[3]*b[0] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[1] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[2] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + movl %ebx,12(%eax) + movl 12(%esi),%eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx,%ebx + # mul a[3]*b[1] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[2] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[3] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + movl %ecx,16(%eax) + movl 12(%esi),%eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx,%ecx + # mul a[3]*b[2] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[3] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + movl %ebp,20(%eax) + movl 12(%esi),%eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp,%ebp + # mul a[3]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + adcl $0,%ebp + movl %ebx,24(%eax) + # saved r[6] + # save r[7] + movl %ecx,28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_sqr_comba8 +.private_extern _bn_sqr_comba8 +.align 4 +_bn_sqr_comba8: +L_bn_sqr_comba8_begin: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp),%edi + movl 24(%esp),%esi + xorl %ebx,%ebx + xorl %ecx,%ecx + movl (%esi),%eax + # ############### Calculate word 0 + xorl %ebp,%ebp + # sqr a[0]*a[0] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,(%edi) + movl 4(%esi),%eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx,%ebx + # sqr a[1]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + 
adcl $0,%ebx + movl %ecx,4(%edi) + movl (%esi),%edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx,%ecx + # sqr a[2]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 4(%esi),%eax + adcl $0,%ecx + # sqr a[1]*a[1] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl (%esi),%edx + adcl $0,%ecx + movl %ebp,8(%edi) + movl 12(%esi),%eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp,%ebp + # sqr a[3]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 8(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[2]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 16(%esi),%eax + adcl $0,%ebp + movl %ebx,12(%edi) + movl (%esi),%edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx,%ebx + # sqr a[4]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 12(%esi),%eax + adcl $0,%ebx + movl 4(%esi),%edx + # sqr a[3]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + # sqr a[2]*a[2] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl (%esi),%edx + adcl $0,%ebx + movl %ecx,16(%edi) + movl 20(%esi),%eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx,%ecx + # sqr a[5]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 16(%esi),%eax + adcl $0,%ecx + movl 4(%esi),%edx + # sqr a[4]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 12(%esi),%eax + adcl $0,%ecx + movl 8(%esi),%edx + # sqr a[3]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl %ebp,20(%edi) + movl (%esi),%edx + # saved r[5] + # ############### Calculate word 6 + xorl %ebp,%ebp + # sqr a[6]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 20(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[5]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 16(%esi),%eax + adcl $0,%ebp + movl 8(%esi),%edx + # sqr a[4]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 12(%esi),%eax + adcl $0,%ebp + # sqr a[3]*a[3] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,24(%edi) + movl 28(%esi),%eax + # saved r[6] + # ############### Calculate word 7 + xorl %ebx,%ebx + # sqr a[7]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 24(%esi),%eax + adcl $0,%ebx + movl 4(%esi),%edx + # sqr a[6]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 20(%esi),%eax + adcl $0,%ebx + movl 8(%esi),%edx + # sqr a[5]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 16(%esi),%eax + adcl $0,%ebx + movl 12(%esi),%edx + # sqr a[4]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 28(%esi),%eax + adcl $0,%ebx + movl %ecx,28(%edi) + movl 4(%esi),%edx + # saved r[7] + # ############### Calculate word 8 + xorl %ecx,%ecx + # sqr a[7]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + 
adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl 8(%esi),%edx + # sqr a[6]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 20(%esi),%eax + adcl $0,%ecx + movl 12(%esi),%edx + # sqr a[5]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 16(%esi),%eax + adcl $0,%ecx + # sqr a[4]*a[4] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl 8(%esi),%edx + adcl $0,%ecx + movl %ebp,32(%edi) + movl 28(%esi),%eax + # saved r[8] + # ############### Calculate word 9 + xorl %ebp,%ebp + # sqr a[7]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%eax + adcl $0,%ebp + movl 12(%esi),%edx + # sqr a[6]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 20(%esi),%eax + adcl $0,%ebp + movl 16(%esi),%edx + # sqr a[5]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 28(%esi),%eax + adcl $0,%ebp + movl %ebx,36(%edi) + movl 12(%esi),%edx + # saved r[9] + # ############### Calculate word 10 + xorl %ebx,%ebx + # sqr a[7]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 24(%esi),%eax + adcl $0,%ebx + movl 16(%esi),%edx + # sqr a[6]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 20(%esi),%eax + adcl $0,%ebx + # sqr a[5]*a[5] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl 16(%esi),%edx + adcl $0,%ebx + movl %ecx,40(%edi) + movl 28(%esi),%eax + # saved r[10] + # ############### Calculate word 11 + xorl %ecx,%ecx + # sqr a[7]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl 20(%esi),%edx + # sqr a[6]*a[5] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 28(%esi),%eax + adcl $0,%ecx + movl %ebp,44(%edi) + movl 20(%esi),%edx + # saved r[11] + # ############### Calculate word 12 + xorl %ebp,%ebp + # sqr a[7]*a[5] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%eax + adcl $0,%ebp + # sqr a[6]*a[6] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%edx + adcl $0,%ebp + movl %ebx,48(%edi) + movl 28(%esi),%eax + # saved r[12] + # ############### Calculate word 13 + xorl %ebx,%ebx + # sqr a[7]*a[6] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 28(%esi),%eax + adcl $0,%ebx + movl %ecx,52(%edi) + # saved r[13] + # ############### Calculate word 14 + xorl %ecx,%ecx + # sqr a[7]*a[7] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + adcl $0,%ecx + movl %ebp,56(%edi) + # saved r[14] + movl %ebx,60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_sqr_comba4 +.private_extern _bn_sqr_comba4 +.align 4 +_bn_sqr_comba4: +L_bn_sqr_comba4_begin: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp),%edi + movl 24(%esp),%esi + xorl %ebx,%ebx + xorl %ecx,%ecx + movl (%esi),%eax + # ############### Calculate word 0 + xorl %ebp,%ebp + # sqr a[0]*a[0] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,(%edi) + movl 4(%esi),%eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx,%ebx + # sqr a[1]*a[0] + mull %edx + addl %eax,%eax + 
adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + movl %ecx,4(%edi) + movl (%esi),%edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx,%ecx + # sqr a[2]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 4(%esi),%eax + adcl $0,%ecx + # sqr a[1]*a[1] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl (%esi),%edx + adcl $0,%ecx + movl %ebp,8(%edi) + movl 12(%esi),%eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp,%ebp + # sqr a[3]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 8(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[2]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 12(%esi),%eax + adcl $0,%ebp + movl %ebx,12(%edi) + movl 4(%esi),%edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx,%ebx + # sqr a[3]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + # sqr a[2]*a[2] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%edx + adcl $0,%ebx + movl %ecx,16(%edi) + movl 12(%esi),%eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx,%ecx + # sqr a[3]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 12(%esi),%eax + adcl $0,%ecx + movl %ebp,20(%edi) + # saved r[5] + # ############### Calculate word 6 + xorl %ebp,%ebp + # sqr a[3]*a[3] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + adcl $0,%ebp + movl %ebx,24(%edi) + # saved r[6] + movl %ecx,28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/co-586-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/co-586-linux.S index 4c61741a..486b1ce7 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/co-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/co-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1264,7 +1263,6 @@ bn_sqr_comba4: ret .size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-armv4-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-armv4-linux.S index 33cb9b7c..4c2cddd6 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-armv4-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-armv4-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -244,7 +243,6 @@ gcm_ghash_neon: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-apple.S index a1fa2e04..41de5a36 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -335,7 +334,6 @@ Lmasks: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-linux.S index fd4185bd..449962d4 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-neon-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -335,7 +334,6 @@ gcm_ghash_neon: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-win.S new file mode 100644 index 00000000..996cf34d --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-neon-armv8-win.S @@ -0,0 +1,346 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.text + +.globl gcm_init_neon + +.def gcm_init_neon + .type 32 +.endef +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl gcm_gmult_neon + +.def gcm_gmult_neon + .type 32 +.endef +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl gcm_ghash_neon + +.def gcm_ghash_neon + .type 32 +.endef +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section .rodata +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-apple.S new file mode 100644 index 00000000..5f016a29 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-apple.S @@ -0,0 +1,293 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _gcm_gmult_ssse3 +.private_extern _gcm_gmult_ssse3 +.align 4 +_gcm_gmult_ssse3: +L_gcm_gmult_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movdqu (%edi),%xmm0 + call L000pic_point +L000pic_point: + popl %eax + movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7 + movdqa Llow4_mask-L000pic_point(%eax),%xmm2 +.byte 102,15,56,0,199 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L001loop_row_1: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L001loop_row_1 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L002loop_row_2: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L002loop_row_2 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L003loop_row_3: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L003loop_row_3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,0,215 + movdqu %xmm2,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _gcm_ghash_ssse3 +.private_extern _gcm_ghash_ssse3 +.align 4 +_gcm_ghash_ssse3: +L_gcm_ghash_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + movdqu (%edi),%xmm0 + call L004pic_point +L004pic_point: + popl %ebx + movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7 + andl $-16,%ecx +.byte 102,15,56,0,199 + pxor %xmm3,%xmm3 +L005loop_ghash: + movdqa Llow4_mask-L004pic_point(%ebx),%xmm2 + movdqu (%edx),%xmm1 +.byte 102,15,56,0,207 + pxor %xmm1,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + movl $5,%eax +L006loop_row_4: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa 
%xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L006loop_row_4 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L007loop_row_5: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L007loop_row_5 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L008loop_row_6: + movdqa (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 +.byte 102,15,58,15,243,1 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 +.byte 102,15,56,0,224 +.byte 102,15,56,0,233 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L008loop_row_6 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + leal -256(%esi),%esi + leal 16(%edx),%edx + subl $16,%ecx + jnz L005loop_ghash +.byte 102,15,56,0,199 + movdqu %xmm0,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 4,0x90 +Lreverse_bytes: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.align 4,0x90 +Llow4_mask: +.long 252645135,252645135,252645135,252645135 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-linux.S index cfa55517..dc28a922 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -292,7 +291,6 @@ gcm_ghash_ssse3: .Llow4_mask: .long 252645135,252645135,252645135,252645135 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-apple.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-apple.S index a334b97d..b53ae17c 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -423,7 +422,6 @@ L$low4_mask: .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-linux.S index f16865c0..5967f1b2 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-ssse3-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-ssse3-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -423,7 +422,6 @@ _CET_ENDBR .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-apple.S new file mode 100644 index 00000000..25dfc3c7 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-apple.S @@ -0,0 +1,327 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _gcm_init_clmul +.private_extern _gcm_init_clmul +.align 4 +_gcm_init_clmul: +L_gcm_init_clmul_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + call L000pic +L000pic: + popl %ecx + leal Lbswap-L000pic(%ecx),%ecx + movdqu (%eax),%xmm2 + pshufd $78,%xmm2,%xmm2 + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + pand 16(%ecx),%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm2,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,(%edx) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%edx) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%edx) + ret +.globl _gcm_gmult_clmul +.private_extern _gcm_gmult_clmul +.align 4 +_gcm_gmult_clmul: +L_gcm_gmult_clmul_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + call L001pic +L001pic: + popl %ecx + leal Lbswap-L001pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movups (%edx),%xmm2 +.byte 102,15,56,0,197 + movups 32(%edx),%xmm4 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,197 + movdqu %xmm0,(%eax) + ret +.globl _gcm_ghash_clmul +.private_extern _gcm_ghash_clmul +.align 4 +_gcm_ghash_clmul: +L_gcm_ghash_clmul_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%eax + movl 24(%esp),%edx + movl 28(%esp),%esi + movl 32(%esp),%ebx + call L002pic +L002pic: + popl %ecx + leal Lbswap-L002pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movdqu (%edx),%xmm2 +.byte 102,15,56,0,197 + subl $16,%ebx + jz L003odd_tail + movdqu (%esi),%xmm3 + movdqu 16(%esi),%xmm6 +.byte 102,15,56,0,221 +.byte 102,15,56,0,245 + movdqu 32(%edx),%xmm5 + pxor %xmm3,%xmm0 + pshufd $78,%xmm6,%xmm3 + movdqa %xmm6,%xmm7 + pxor %xmm6,%xmm3 + leal 32(%esi),%esi +.byte 102,15,58,68,242,0 +.byte 102,15,58,68,250,17 +.byte 102,15,58,68,221,0 + movups 16(%edx),%xmm2 + nop + subl $32,%ebx + jbe L004even_tail + jmp L005mod_loop +.align 5,0x90 +L005mod_loop: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 + nop +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,229,16 + movups (%edx),%xmm2 + xorps 
%xmm6,%xmm0 + movdqa (%ecx),%xmm5 + xorps %xmm7,%xmm1 + movdqu (%esi),%xmm7 + pxor %xmm0,%xmm3 + movdqu 16(%esi),%xmm6 + pxor %xmm1,%xmm3 +.byte 102,15,56,0,253 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 +.byte 102,15,56,0,245 + pxor %xmm7,%xmm1 + movdqa %xmm6,%xmm7 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 +.byte 102,15,58,68,242,0 + movups 32(%edx),%xmm5 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm7,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm7,%xmm3 + pxor %xmm4,%xmm1 +.byte 102,15,58,68,250,17 + movups 16(%edx),%xmm2 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,58,68,221,0 + leal 32(%esi),%esi + subl $32,%ebx + ja L005mod_loop +L004even_tail: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,229,16 + movdqa (%ecx),%xmm5 + xorps %xmm6,%xmm0 + xorps %xmm7,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testl %ebx,%ebx + jnz L006done + movups (%edx),%xmm2 +L003odd_tail: + movdqu (%esi),%xmm3 +.byte 102,15,56,0,221 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +L006done: +.byte 102,15,56,0,197 + movdqu %xmm0,(%eax) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +Lbswap: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +.byte 0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-linux.S index 1acf76aa..2c54b18e 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86-linux.linux.x86.S +++ 
b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -328,7 +327,6 @@ gcm_ghash_clmul: .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-apple.S index bd76e53b..310c052a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -614,6 +613,7 @@ L$done: .p2align 5 _gcm_init_avx: + _CET_ENDBR vzeroupper @@ -736,6 +736,7 @@ _CET_ENDBR .p2align 5 _gcm_ghash_avx: + _CET_ENDBR vzeroupper @@ -1125,7 +1126,6 @@ L$7_mask: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-linux.S index 8f473262..4cfc5b30 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghash-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghash-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -614,6 +613,7 @@ _CET_ENDBR .align 32 gcm_init_avx: .cfi_startproc + _CET_ENDBR vzeroupper @@ -736,6 +736,7 @@ _CET_ENDBR .align 32 gcm_ghash_avx: .cfi_startproc + _CET_ENDBR vzeroupper @@ -1125,7 +1126,6 @@ _CET_ENDBR .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv7-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv7-linux.S index 2d9c60b5..f552f576 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv7-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv7-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -246,7 +245,6 @@ gcm_ghash_v8: .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-apple.S index 0e7c3c4a..467df8f2 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -565,7 +564,6 @@ Ldone4x: .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-linux.S index 6f861858..edfda39a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/ghashv8-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -565,7 +564,6 @@ gcm_ghash_v8_4x: .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-win.S new file mode 100644 index 00000000..d0b9a02c --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/ghashv8-armv8-win.S @@ -0,0 +1,578 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.globl gcm_init_v8 + +.def gcm_init_v8 + .type 32 +.endef +.align 4 +gcm_init_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret + +.globl gcm_gmult_v8 + +.def gcm_gmult_v8 + .type 32 +.endef +.align 4 +gcm_gmult_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.globl gcm_ghash_v8 + +.def gcm_ghash_v8 + .type 32 +.endef +.align 4 +gcm_ghash_v8: + AARCH64_VALID_CALL_TARGET + cmp x3,#64 + b.hs Lgcm_ghash_v8_4x + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? + ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b + rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b Loop_mod2x_v8 + +.align 4 +Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? 
+ eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq Ldone_v8 //is x3 zero? +Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +Ldone_v8: +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.def gcm_ghash_v8_4x + .type 32 +.endef +.align 4 +gcm_ghash_v8_4x: +Lgcm_ghash_v8_4x: + ld1 {v0.2d},[x0] //load [rotated] Xi + ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant + + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + ext v25.16b,v7.16b,v7.16b,#8 + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + pmull2 v31.1q,v20.2d,v25.2d + pmull v30.1q,v21.1d,v7.1d + + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#128 + b.lo Ltail4x + + b Loop4x + +.align 4 +Loop4x: + eor v16.16b,v4.16b,v0.16b + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 + ext v3.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + ext 
v25.16b,v7.16b,v7.16b,#8 + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + ext v24.16b,v6.16b,v6.16b,#8 + eor v1.16b,v1.16b,v30.16b + ext v23.16b,v5.16b,v5.16b,#8 + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + eor v1.16b,v1.16b,v17.16b + pmull2 v31.1q,v20.2d,v25.2d + eor v1.16b,v1.16b,v18.16b + pmull v30.1q,v21.1d,v7.1d + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + eor v0.16b,v1.16b,v18.16b + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + eor v18.16b,v18.16b,v2.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v0.16b,v0.16b,v18.16b + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#64 + b.hs Loop4x + +Ltail4x: + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + + adds x3,x3,#64 + b.eq Ldone4x + + cmp x3,#32 + b.lo Lone + b.eq Ltwo +Lthree: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d,v6.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + pmull v29.1q,v20.1d,v24.1d //H·Ii+2 + eor v6.16b,v6.16b,v24.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + pmull2 v31.1q,v20.2d,v24.2d + pmull v30.1q,v21.1d,v6.1d + eor v0.16b,v0.16b,v18.16b + pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1 + eor v5.16b,v5.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull2 v23.1q,v22.2d,v23.2d + eor v16.16b,v4.16b,v0.16b + pmull2 v5.1q,v21.2d,v5.2d + ext v3.16b,v16.16b,v16.16b,#8 + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v26.2d,v3.2d + pmull v1.1q,v27.1d,v16.1d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b Ldone4x + +.align 4 +Ltwo: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull v29.1q,v20.1d,v23.1d //H·Ii+1 + eor v5.16b,v5.16b,v23.16b + + eor v16.16b,v4.16b,v0.16b 
+ ext v3.16b,v16.16b,v16.16b,#8 + + pmull2 v31.1q,v20.2d,v23.2d + pmull v30.1q,v21.1d,v5.1d + + pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v22.2d,v3.2d + pmull2 v1.1q,v21.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b Ldone4x + +.align 4 +Lone: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v20.1d,v3.1d + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v20.2d,v3.2d + pmull v1.1q,v21.1d,v16.1d + +Ldone4x: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-apple.S index 41f40276..6bc1b99a 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1726,7 +1725,6 @@ Lselect_w7_loop: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-linux.S index a3a54045..8d0bcba5 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-armv8-asm-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1726,7 +1725,6 @@ ecp_nistz256_select_w7: ret .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-win.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-win.S new file mode 100644 index 00000000..6a71ed78 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-armv8-asm-win.S @@ -0,0 +1,1771 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include "CCryptoBoringSSL_arm_arch.h" + +.section .rodata +.align 5 +Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +Lone: +.quad 1,0,0,0 +Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont + +.def ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont + +.def ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 + +.def ecp_nistz256_div_by_2 + .type 32 +.endef +.align 4 +ecp_nistz256_div_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 + +.def ecp_nistz256_mul_by_2 + .type 32 +.endef +.align 4 +ecp_nistz256_mul_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 + +.def ecp_nistz256_mul_by_3 + .type 32 +.endef +.align 4 +ecp_nistz256_mul_by_3: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + mov x8,x4 + mov x9,x5 + mov x10,x6 + mov x11,x7 + + bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_sub + +.def ecp_nistz256_sub + .type 32 +.endef +.align 4 +ecp_nistz256_sub: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg + +.def ecp_nistz256_neg + .type 32 +.endef +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 +.def __ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs 
x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 +.def __ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
+.def __ecp_nistz256_add_to + .type 32 +.endef +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_from + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_morf + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_div_by_2 + .type 32 +.endef +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret + +.globl ecp_nistz256_point_double + +.def ecp_nistz256_point_double + .type 32 +.endef +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add + +.def ecp_nistz256_point_add + .type 32 +.endef +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add_affine + +.def ecp_nistz256_point_add_affine + .type 32 +.endef +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,Lone_mont-64 + add x23,x23,:lo12:Lone_mont-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont + +.def ecp_nistz256_ord_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + 
adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont + +.def ecp_nistz256_ord_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b Loop_ord_sqr + +.align 4 +Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
+ + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 + +.def ecp_nistz256_select_w5 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +Lselect_w5_loop: + // Loop 16 times. 
+ + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz w9, #4, Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret + + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 + +.def ecp_nistz256_select_w7 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-apple.S similarity index 95% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-apple.S index f9ef5575..0d3c4d85 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -9,7 +8,6 @@ .text - .section __DATA,__const .p2align 6 L$poly: @@ -94,18 +92,13 @@ L$neg_epilogue: -.globl _ecp_nistz256_ord_mul_mont -.private_extern _ecp_nistz256_ord_mul_mont +.globl _ecp_nistz256_ord_mul_mont_nohw +.private_extern _ecp_nistz256_ord_mul_mont_nohw .p2align 5 -_ecp_nistz256_ord_mul_mont: +_ecp_nistz256_ord_mul_mont_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$ecp_nistz256_ord_mul_montx pushq %rbp pushq %rbx @@ -423,18 +416,13 @@ L$ord_mul_epilogue: -.globl _ecp_nistz256_ord_sqr_mont -.private_extern _ecp_nistz256_ord_sqr_mont +.globl _ecp_nistz256_ord_sqr_mont_nohw +.private_extern _ecp_nistz256_ord_sqr_mont_nohw .p2align 5 -_ecp_nistz256_ord_sqr_mont: +_ecp_nistz256_ord_sqr_mont_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$ecp_nistz256_ord_sqr_montx pushq %rbp pushq %rbx @@ -716,11 +704,14 @@ L$ord_sqr_epilogue: +.globl _ecp_nistz256_ord_mul_mont_adx +.private_extern _ecp_nistz256_ord_mul_mont_adx .p2align 5 -ecp_nistz256_ord_mul_montx: +_ecp_nistz256_ord_mul_mont_adx: -L$ecp_nistz256_ord_mul_montx: +L$ecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR pushq %rbp pushq %rbx @@ -952,11 +943,14 @@ L$ord_mulx_epilogue: +.globl _ecp_nistz256_ord_sqr_mont_adx +.private_extern _ecp_nistz256_ord_sqr_mont_adx .p2align 5 -ecp_nistz256_ord_sqr_montx: +_ecp_nistz256_ord_sqr_mont_adx: -L$ecp_nistz256_ord_sqr_montx: +_CET_ENDBR +L$ecp_nistz256_ord_sqr_mont_adx: pushq %rbp pushq %rbx @@ -1165,17 +1159,13 @@ L$ord_sqrx_epilogue: -.globl _ecp_nistz256_mul_mont -.private_extern _ecp_nistz256_mul_mont +.globl _ecp_nistz256_mul_mont_nohw +.private_extern _ecp_nistz256_mul_mont_nohw .p2align 5 -_ecp_nistz256_mul_mont: +_ecp_nistz256_mul_mont_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx -L$mul_mont: pushq %rbp pushq %rbx @@ -1189,8 +1179,6 @@ L$mul_mont: pushq %r15 L$mul_body: - cmpl $0x80100,%ecx - je L$mul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -1199,20 +1187,7 @@ L$mul_body: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq - jmp L$mul_mont_done - -.p2align 5 -L$mul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - call __ecp_nistz256_mul_montx -L$mul_mont_done: movq 0(%rsp),%r15 movq 8(%rsp),%r14 @@ -1457,16 +1432,13 @@ __ecp_nistz256_mul_montq: -.globl _ecp_nistz256_sqr_mont -.private_extern _ecp_nistz256_sqr_mont +.globl _ecp_nistz256_sqr_mont_nohw +.private_extern _ecp_nistz256_sqr_mont_nohw .p2align 5 -_ecp_nistz256_sqr_mont: +_ecp_nistz256_sqr_mont_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx pushq %rbp pushq %rbx @@ -1480,26 +1452,13 @@ _CET_ENDBR pushq %r15 L$sqr_body: - cmpl $0x80100,%ecx - je L$sqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq - jmp L$sqr_mont_done - -.p2align 5 -L$sqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - call __ecp_nistz256_sqr_montx -L$sqr_mont_done: movq 0(%rsp),%r15 movq 8(%rsp),%r14 @@ -1682,6 +1641,55 @@ __ecp_nistz256_sqr_montq: ret +.globl _ecp_nistz256_mul_mont_adx +.private_extern _ecp_nistz256_mul_mont_adx + +.p2align 5 +_ecp_nistz256_mul_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + 
+L$mulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mulx_epilogue: + ret + + + .p2align 5 __ecp_nistz256_mul_montx: @@ -1851,6 +1859,53 @@ __ecp_nistz256_mul_montx: +.globl _ecp_nistz256_sqr_mont_adx +.private_extern _ecp_nistz256_sqr_mont_adx + +.p2align 5 +_ecp_nistz256_sqr_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqrx_epilogue: + ret + + + .p2align 5 __ecp_nistz256_sqr_montx: @@ -1982,17 +2037,13 @@ __ecp_nistz256_sqr_montx: -.globl _ecp_nistz256_select_w5 -.private_extern _ecp_nistz256_select_w5 +.globl _ecp_nistz256_select_w5_nohw +.private_extern _ecp_nistz256_select_w5_nohw .p2align 5 -_ecp_nistz256_select_w5: +_ecp_nistz256_select_w5_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz L$avx2_select_w5 movdqa L$One(%rip),%xmm0 movd %edx,%xmm1 @@ -2045,22 +2096,18 @@ L$select_loop_sse_w5: movdqu %xmm7,80(%rdi) ret -L$SEH_end_ecp_nistz256_select_w5: +L$SEH_end_ecp_nistz256_select_w5_nohw: -.globl _ecp_nistz256_select_w7 -.private_extern _ecp_nistz256_select_w7 +.globl _ecp_nistz256_select_w7_nohw +.private_extern _ecp_nistz256_select_w7_nohw .p2align 5 -_ecp_nistz256_select_w7: +_ecp_nistz256_select_w7_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz L$avx2_select_w7 movdqa L$One(%rip),%xmm8 movd %edx,%xmm1 @@ -2102,15 +2149,17 @@ L$select_loop_sse_w7: movdqu %xmm5,48(%rdi) ret -L$SEH_end_ecp_nistz256_select_w7: +L$SEH_end_ecp_nistz256_select_w7_nohw: +.globl _ecp_nistz256_select_w5_avx2 +.private_extern _ecp_nistz256_select_w5_avx2 .p2align 5 -ecp_nistz256_avx2_select_w5: +_ecp_nistz256_select_w5_avx2: -L$avx2_select_w5: +_CET_ENDBR vzeroupper vmovdqa L$Two(%rip),%ymm0 @@ -2165,18 +2214,17 @@ L$select_loop_avx2_w5: vzeroupper ret -L$SEH_end_ecp_nistz256_avx2_select_w5: +L$SEH_end_ecp_nistz256_select_w5_avx2: -.globl _ecp_nistz256_avx2_select_w7 -.private_extern _ecp_nistz256_avx2_select_w7 +.globl _ecp_nistz256_select_w7_avx2 +.private_extern _ecp_nistz256_select_w7_avx2 .p2align 5 -_ecp_nistz256_avx2_select_w7: +_ecp_nistz256_select_w7_avx2: -L$avx2_select_w7: _CET_ENDBR vzeroupper vmovdqa L$Three(%rip),%ymm0 @@ -2247,7 +2295,7 @@ L$select_loop_avx2_w7: vzeroupper ret -L$SEH_end_ecp_nistz256_avx2_select_w7: +L$SEH_end_ecp_nistz256_select_w7_avx2: .p2align 5 @@ -2378,18 +2426,13 @@ __ecp_nistz256_mul_by_2q: ret -.globl _ecp_nistz256_point_double -.private_extern _ecp_nistz256_point_double +.globl _ecp_nistz256_point_double_nohw +.private_extern _ecp_nistz256_point_double_nohw .p2align 5 -_ecp_nistz256_point_double: +_ecp_nistz256_point_double_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_doublex pushq %rbp pushq %rbx @@ -2607,18 +2650,13 @@ L$point_doubleq_epilogue: ret -.globl _ecp_nistz256_point_add -.private_extern 
_ecp_nistz256_point_add +.globl _ecp_nistz256_point_add_nohw +.private_extern _ecp_nistz256_point_add_nohw .p2align 5 -_ecp_nistz256_point_add: +_ecp_nistz256_point_add_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_addx pushq %rbp pushq %rbx @@ -3039,18 +3077,13 @@ L$point_addq_epilogue: ret -.globl _ecp_nistz256_point_add_affine -.private_extern _ecp_nistz256_point_add_affine +.globl _ecp_nistz256_point_add_affine_nohw +.private_extern _ecp_nistz256_point_add_affine_nohw .p2align 5 -_ecp_nistz256_point_add_affine: +_ecp_nistz256_point_add_affine_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_add_affinex pushq %rbp pushq %rbx @@ -3503,11 +3536,13 @@ __ecp_nistz256_mul_by_2x: ret +.globl _ecp_nistz256_point_double_adx +.private_extern _ecp_nistz256_point_double_adx .p2align 5 -ecp_nistz256_point_doublex: +_ecp_nistz256_point_double_adx: -L$point_doublex: +_CET_ENDBR pushq %rbp pushq %rbx @@ -3725,11 +3760,13 @@ L$point_doublex_epilogue: ret +.globl _ecp_nistz256_point_add_adx +.private_extern _ecp_nistz256_point_add_adx .p2align 5 -ecp_nistz256_point_addx: +_ecp_nistz256_point_add_adx: -L$point_addx: +_CET_ENDBR pushq %rbp pushq %rbx @@ -4150,11 +4187,13 @@ L$point_addx_epilogue: ret +.globl _ecp_nistz256_point_add_affine_adx +.private_extern _ecp_nistz256_point_add_affine_adx .p2align 5 -ecp_nistz256_point_add_affinex: +_ecp_nistz256_point_add_affine_adx: -L$point_add_affinex: +_CET_ENDBR pushq %rbp pushq %rbx @@ -4473,7 +4512,6 @@ L$add_affinex_epilogue: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-linux.S similarity index 92% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-linux.S index 7bd14501..eec11d7c 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256-x86_64-asm-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256-x86_64-asm-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -7,8 +6,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P .section .rodata @@ -97,18 +94,13 @@ _CET_ENDBR -.globl ecp_nistz256_ord_mul_mont -.hidden ecp_nistz256_ord_mul_mont -.type ecp_nistz256_ord_mul_mont,@function +.globl ecp_nistz256_ord_mul_mont_nohw +.hidden ecp_nistz256_ord_mul_mont_nohw +.type ecp_nistz256_ord_mul_mont_nohw,@function .align 32 -ecp_nistz256_ord_mul_mont: +ecp_nistz256_ord_mul_mont_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_mul_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -424,7 +416,7 @@ _CET_ENDBR .Lord_mul_epilogue: ret .cfi_endproc -.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont +.size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw @@ -432,18 +424,13 @@ _CET_ENDBR -.globl ecp_nistz256_ord_sqr_mont -.hidden ecp_nistz256_ord_sqr_mont -.type ecp_nistz256_ord_sqr_mont,@function +.globl ecp_nistz256_ord_sqr_mont_nohw +.hidden ecp_nistz256_ord_sqr_mont_nohw +.type ecp_nistz256_ord_sqr_mont_nohw,@function .align 32 -ecp_nistz256_ord_sqr_mont: +ecp_nistz256_ord_sqr_mont_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_sqr_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -729,13 +716,16 @@ _CET_ENDBR .Lord_sqr_epilogue: ret .cfi_endproc -.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +.size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw -.type ecp_nistz256_ord_mul_montx,@function +.globl ecp_nistz256_ord_mul_mont_adx +.hidden ecp_nistz256_ord_mul_mont_adx +.type ecp_nistz256_ord_mul_mont_adx,@function .align 32 -ecp_nistz256_ord_mul_montx: +ecp_nistz256_ord_mul_mont_adx: .cfi_startproc -.Lecp_nistz256_ord_mul_montx: +.Lecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -971,13 +961,16 @@ ecp_nistz256_ord_mul_montx: .Lord_mulx_epilogue: ret .cfi_endproc -.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx +.size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx -.type ecp_nistz256_ord_sqr_montx,@function +.globl ecp_nistz256_ord_sqr_mont_adx +.hidden ecp_nistz256_ord_sqr_mont_adx +.type ecp_nistz256_ord_sqr_mont_adx,@function .align 32 -ecp_nistz256_ord_sqr_montx: +ecp_nistz256_ord_sqr_mont_adx: .cfi_startproc -.Lecp_nistz256_ord_sqr_montx: +_CET_ENDBR +.Lecp_nistz256_ord_sqr_mont_adx: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1185,24 +1178,20 @@ ecp_nistz256_ord_sqr_montx: .Lord_sqrx_epilogue: ret .cfi_endproc -.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx +.size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx -.globl ecp_nistz256_mul_mont -.hidden ecp_nistz256_mul_mont -.type ecp_nistz256_mul_mont,@function +.globl ecp_nistz256_mul_mont_nohw +.hidden ecp_nistz256_mul_mont_nohw +.type ecp_nistz256_mul_mont_nohw,@function .align 32 -ecp_nistz256_mul_mont: +ecp_nistz256_mul_mont_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx -.Lmul_mont: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1222,8 +1211,6 @@ _CET_ENDBR .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmul_body: - cmpl $0x80100,%ecx - je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -1232,20 +1219,7 
@@ _CET_ENDBR movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq - jmp .Lmul_mont_done -.align 32 -.Lmul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_mul_montx -.Lmul_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 @@ -1263,7 +1237,7 @@ _CET_ENDBR .Lmul_epilogue: ret .cfi_endproc -.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont +.size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw .type __ecp_nistz256_mul_montq,@function .align 32 @@ -1490,16 +1464,13 @@ __ecp_nistz256_mul_montq: -.globl ecp_nistz256_sqr_mont -.hidden ecp_nistz256_sqr_mont -.type ecp_nistz256_sqr_mont,@function +.globl ecp_nistz256_sqr_mont_nohw +.hidden ecp_nistz256_sqr_mont_nohw +.type ecp_nistz256_sqr_mont_nohw,@function .align 32 -ecp_nistz256_sqr_mont: +ecp_nistz256_sqr_mont_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1519,26 +1490,13 @@ _CET_ENDBR .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqr_body: - cmpl $0x80100,%ecx - je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq - jmp .Lsqr_mont_done - -.align 32 -.Lsqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - call __ecp_nistz256_sqr_montx -.Lsqr_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 @@ -1556,7 +1514,7 @@ _CET_ENDBR .Lsqr_epilogue: ret .cfi_endproc -.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont +.size ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw .type __ecp_nistz256_sqr_montq,@function .align 32 @@ -1721,6 +1679,61 @@ __ecp_nistz256_sqr_montq: ret .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.globl ecp_nistz256_mul_mont_adx +.hidden ecp_nistz256_mul_mont_adx +.type ecp_nistz256_mul_mont_adx,@function +.align 32 +ecp_nistz256_mul_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx + .type __ecp_nistz256_mul_montx,@function .align 32 __ecp_nistz256_mul_montx: @@ -1890,6 +1903,59 @@ __ecp_nistz256_mul_montx: .cfi_endproc .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx +.globl ecp_nistz256_sqr_mont_adx +.hidden ecp_nistz256_sqr_mont_adx +.type ecp_nistz256_sqr_mont_adx,@function +.align 32 +ecp_nistz256_sqr_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + 
pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx + .type __ecp_nistz256_sqr_montx,@function .align 32 __ecp_nistz256_sqr_montx: @@ -2021,17 +2087,13 @@ __ecp_nistz256_sqr_montx: .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx -.globl ecp_nistz256_select_w5 -.hidden ecp_nistz256_select_w5 -.type ecp_nistz256_select_w5,@function +.globl ecp_nistz256_select_w5_nohw +.hidden ecp_nistz256_select_w5_nohw +.type ecp_nistz256_select_w5_nohw,@function .align 32 -ecp_nistz256_select_w5: +ecp_nistz256_select_w5_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz .Lavx2_select_w5 movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 @@ -2084,22 +2146,18 @@ _CET_ENDBR movdqu %xmm7,80(%rdi) ret .cfi_endproc -.LSEH_end_ecp_nistz256_select_w5: -.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 +.LSEH_end_ecp_nistz256_select_w5_nohw: +.size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw -.globl ecp_nistz256_select_w7 -.hidden ecp_nistz256_select_w7 -.type ecp_nistz256_select_w7,@function +.globl ecp_nistz256_select_w7_nohw +.hidden ecp_nistz256_select_w7_nohw +.type ecp_nistz256_select_w7_nohw,@function .align 32 -ecp_nistz256_select_w7: +ecp_nistz256_select_w7_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz .Lavx2_select_w7 movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 @@ -2141,15 +2199,17 @@ _CET_ENDBR movdqu %xmm5,48(%rdi) ret .cfi_endproc -.LSEH_end_ecp_nistz256_select_w7: -.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +.LSEH_end_ecp_nistz256_select_w7_nohw: +.size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw -.type ecp_nistz256_avx2_select_w5,@function +.globl ecp_nistz256_select_w5_avx2 +.hidden ecp_nistz256_select_w5_avx2 +.type ecp_nistz256_select_w5_avx2,@function .align 32 -ecp_nistz256_avx2_select_w5: +ecp_nistz256_select_w5_avx2: .cfi_startproc -.Lavx2_select_w5: +_CET_ENDBR vzeroupper vmovdqa .LTwo(%rip),%ymm0 @@ -2204,18 +2264,17 @@ ecp_nistz256_avx2_select_w5: vzeroupper ret .cfi_endproc -.LSEH_end_ecp_nistz256_avx2_select_w5: -.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 +.LSEH_end_ecp_nistz256_select_w5_avx2: +.size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 -.globl ecp_nistz256_avx2_select_w7 -.hidden ecp_nistz256_avx2_select_w7 -.type ecp_nistz256_avx2_select_w7,@function +.globl ecp_nistz256_select_w7_avx2 +.hidden ecp_nistz256_select_w7_avx2 +.type ecp_nistz256_select_w7_avx2,@function .align 32 -ecp_nistz256_avx2_select_w7: +ecp_nistz256_select_w7_avx2: .cfi_startproc -.Lavx2_select_w7: _CET_ENDBR vzeroupper vmovdqa .LThree(%rip),%ymm0 @@ -2286,8 +2345,8 @@ _CET_ENDBR vzeroupper ret .cfi_endproc -.LSEH_end_ecp_nistz256_avx2_select_w7: -.size 
ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.LSEH_end_ecp_nistz256_select_w7_avx2: +.size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 .type __ecp_nistz256_add_toq,@function .align 32 __ecp_nistz256_add_toq: @@ -2417,18 +2476,13 @@ __ecp_nistz256_mul_by_2q: ret .cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q -.globl ecp_nistz256_point_double -.hidden ecp_nistz256_point_double -.type ecp_nistz256_point_double,@function +.globl ecp_nistz256_point_double_nohw +.hidden ecp_nistz256_point_double_nohw +.type ecp_nistz256_point_double_nohw,@function .align 32 -ecp_nistz256_point_double: +ecp_nistz256_point_double_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_doublex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2651,19 +2705,14 @@ _CET_ENDBR .Lpoint_doubleq_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_double,.-ecp_nistz256_point_double -.globl ecp_nistz256_point_add -.hidden ecp_nistz256_point_add -.type ecp_nistz256_point_add,@function +.size ecp_nistz256_point_double_nohw,.-ecp_nistz256_point_double_nohw +.globl ecp_nistz256_point_add_nohw +.hidden ecp_nistz256_point_add_nohw +.type ecp_nistz256_point_add_nohw,@function .align 32 -ecp_nistz256_point_add: +ecp_nistz256_point_add_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_addx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3089,19 +3138,14 @@ _CET_ENDBR .Lpoint_addq_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_add,.-ecp_nistz256_point_add -.globl ecp_nistz256_point_add_affine -.hidden ecp_nistz256_point_add_affine -.type ecp_nistz256_point_add_affine,@function +.size ecp_nistz256_point_add_nohw,.-ecp_nistz256_point_add_nohw +.globl ecp_nistz256_point_add_affine_nohw +.hidden ecp_nistz256_point_add_affine_nohw +.type ecp_nistz256_point_add_affine_nohw,@function .align 32 -ecp_nistz256_point_add_affine: +ecp_nistz256_point_add_affine_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_add_affinex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3424,7 +3468,7 @@ _CET_ENDBR .Ladd_affineq_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +.size ecp_nistz256_point_add_affine_nohw,.-ecp_nistz256_point_add_affine_nohw .type __ecp_nistz256_add_tox,@function .align 32 __ecp_nistz256_add_tox: @@ -3560,11 +3604,13 @@ __ecp_nistz256_mul_by_2x: ret .cfi_endproc .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x -.type ecp_nistz256_point_doublex,@function +.globl ecp_nistz256_point_double_adx +.hidden ecp_nistz256_point_double_adx +.type ecp_nistz256_point_double_adx,@function .align 32 -ecp_nistz256_point_doublex: +ecp_nistz256_point_double_adx: .cfi_startproc -.Lpoint_doublex: +_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3787,12 +3833,14 @@ ecp_nistz256_point_doublex: .Lpoint_doublex_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex -.type ecp_nistz256_point_addx,@function +.size ecp_nistz256_point_double_adx,.-ecp_nistz256_point_double_adx +.globl ecp_nistz256_point_add_adx +.hidden ecp_nistz256_point_add_adx +.type ecp_nistz256_point_add_adx,@function .align 32 -ecp_nistz256_point_addx: +ecp_nistz256_point_add_adx: .cfi_startproc -.Lpoint_addx: 
+_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -4218,12 +4266,14 @@ ecp_nistz256_point_addx: .Lpoint_addx_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx -.type ecp_nistz256_point_add_affinex,@function +.size ecp_nistz256_point_add_adx,.-ecp_nistz256_point_add_adx +.globl ecp_nistz256_point_add_affine_adx +.hidden ecp_nistz256_point_add_affine_adx +.type ecp_nistz256_point_add_affine_adx,@function .align 32 -ecp_nistz256_point_add_affinex: +ecp_nistz256_point_add_affine_adx: .cfi_startproc -.Lpoint_add_affinex: +_CET_ENDBR pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -4546,9 +4596,8 @@ ecp_nistz256_point_add_affinex: .Ladd_affinex_epilogue: ret .cfi_endproc -.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex +.size ecp_nistz256_point_add_affine_adx,.-ecp_nistz256_point_add_affine_adx #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-apple.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-apple.S index 9a5941d6..1acdb0b2 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -309,7 +308,6 @@ Lbeeu_finish: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-linux.S index 7f6451f7..061c9d54 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-armv8-asm-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -309,7 +308,6 @@ beeu_mod_inverse_vartime: ret .size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-win.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-win.S new file mode 100644 index 00000000..2f7905d2 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-armv8-asm-win.S @@ -0,0 +1,314 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include "CCryptoBoringSSL_arm_arch.h" + +.text +.globl beeu_mod_inverse_vartime + + +.align 4 +beeu_mod_inverse_vartime: + // Reserve enough space for 14 8-byte registers on the stack + // in the first stp call for x29, x30. + // Then store the remaining callee-saved registers. + // + // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | + // ^ ^ + // sp <------------------- 112 bytes ----------------> old sp + // x29 (FP) + // + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-112]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x2,[sp,#96] + + // B = b3..b0 := a + ldp x25,x26,[x1] + ldp x27,x28,[x1,#16] + + // n3..n0 := n + // Note: the value of input params are changed in the following. + ldp x0,x1,[x2] + ldp x2,x30,[x2,#16] + + // A = a3..a0 := n + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x24, x30 + + // X = x4..x0 := 1 + mov x3, #1 + eor x4, x4, x4 + eor x5, x5, x5 + eor x6, x6, x6 + eor x7, x7, x7 + + // Y = y4..y0 := 0 + eor x8, x8, x8 + eor x9, x9, x9 + eor x10, x10, x10 + eor x11, x11, x11 + eor x12, x12, x12 + +Lbeeu_loop: + // if B == 0, jump to .Lbeeu_loop_end + orr x14, x25, x26 + orr x14, x14, x27 + + // reverse the bit order of x25. This is needed for clz after this macro + rbit x15, x25 + + orr x14, x14, x28 + cbz x14,Lbeeu_loop_end + + + // 0 < B < |n|, + // 0 < A <= |n|, + // (1) X*a == B (mod |n|), + // (2) (-1)*Y*a == A (mod |n|) + + // Now divide B by the maximum possible power of two in the + // integers, and divide X by the same value mod |n|. + // When we're done, (1) still holds. + + // shift := number of trailing 0s in x25 + // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) + clz x13, x15 + + // If there is no shift, goto shift_A_Y + cbz x13, Lbeeu_shift_A_Y + + // Shift B right by "x13" bits + neg x14, x13 + lsr x25, x25, x13 + lsl x15, x26, x14 + + lsr x26, x26, x13 + lsl x19, x27, x14 + + orr x25, x25, x15 + + lsr x27, x27, x13 + lsl x20, x28, x14 + + orr x26, x26, x19 + + lsr x28, x28, x13 + + orr x27, x27, x20 + + + // Shift X right by "x13" bits, adding n whenever X becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +Lbeeu_shift_loop_X: + tbz x3, #0, Lshift1_0 + adds x3, x3, x0 + adcs x4, x4, x1 + adcs x5, x5, x2 + adcs x6, x6, x30 + adc x7, x7, x14 +Lshift1_0: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x3, x4, x3, #1 + extr x4, x5, x4, #1 + extr x5, x6, x5, #1 + extr x6, x7, x6, #1 + lsr x7, x7, #1 + + subs x13, x13, #1 + bne Lbeeu_shift_loop_X + + // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl + // with the following differences: + // - "x13" is set directly to the number of trailing 0s in B + // (using rbit and clz instructions) + // - The loop is only used to call SHIFT1(X) + // and x13 is decreased while executing the X loop. + // - SHIFT256(B, x13) is performed before right-shifting X; they are independent + +Lbeeu_shift_A_Y: + // Same for A and Y. + // Afterwards, (2) still holds. 
+ // Reverse the bit order of x21 + // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) + rbit x15, x21 + clz x13, x15 + + // If there is no shift, goto |B-A|, X+Y update + cbz x13, Lbeeu_update_B_X_or_A_Y + + // Shift A right by "x13" bits + neg x14, x13 + lsr x21, x21, x13 + lsl x15, x22, x14 + + lsr x22, x22, x13 + lsl x19, x23, x14 + + orr x21, x21, x15 + + lsr x23, x23, x13 + lsl x20, x24, x14 + + orr x22, x22, x19 + + lsr x24, x24, x13 + + orr x23, x23, x20 + + + // Shift Y right by "x13" bits, adding n whenever Y becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +Lbeeu_shift_loop_Y: + tbz x8, #0, Lshift1_1 + adds x8, x8, x0 + adcs x9, x9, x1 + adcs x10, x10, x2 + adcs x11, x11, x30 + adc x12, x12, x14 +Lshift1_1: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x8, x9, x8, #1 + extr x9, x10, x9, #1 + extr x10, x11, x10, #1 + extr x11, x12, x11, #1 + lsr x12, x12, #1 + + subs x13, x13, #1 + bne Lbeeu_shift_loop_Y + +Lbeeu_update_B_X_or_A_Y: + // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) + // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words + // without taking a sign bit if generated. The lack of a carry would + // indicate a negative result. See, for example, + // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes + subs x14, x25, x21 + sbcs x15, x26, x22 + sbcs x19, x27, x23 + sbcs x20, x28, x24 + bcs Lbeeu_B_greater_than_A + + // Else A > B => + // A := A - B; Y := Y + X; goto beginning of the loop + subs x21, x21, x25 + sbcs x22, x22, x26 + sbcs x23, x23, x27 + sbcs x24, x24, x28 + + adds x8, x8, x3 + adcs x9, x9, x4 + adcs x10, x10, x5 + adcs x11, x11, x6 + adc x12, x12, x7 + b Lbeeu_loop + +Lbeeu_B_greater_than_A: + // Continue with B > A => + // B := B - A; X := X + Y; goto beginning of the loop + mov x25, x14 + mov x26, x15 + mov x27, x19 + mov x28, x20 + + adds x3, x3, x8 + adcs x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + b Lbeeu_loop + +Lbeeu_loop_end: + // The Euclid's algorithm loop ends when A == gcd(a,n); + // this would be 1, when a and n are co-prime (i.e. do not have a common factor). + // Since (-1)*Y*a == A (mod |n|), Y>0 + // then out = -Y mod n + + // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) + // Is A-1 == 0? + // If not, fail. 
+ sub x14, x21, #1 + orr x14, x14, x22 + orr x14, x14, x23 + orr x14, x14, x24 + cbnz x14, Lbeeu_err + + // If Y>n ==> Y:=Y-n +Lbeeu_reduction_loop: + // x_i := y_i - n_i (X is no longer needed, use it as temp) + // (x14 = 0 from above) + subs x3, x8, x0 + sbcs x4, x9, x1 + sbcs x5, x10, x2 + sbcs x6, x11, x30 + sbcs x7, x12, x14 + + // If result is non-negative (i.e., cs = carry set = no borrow), + // y_i := x_i; goto reduce again + // else + // y_i := y_i; continue + csel x8, x3, x8, cs + csel x9, x4, x9, cs + csel x10, x5, x10, cs + csel x11, x6, x11, cs + csel x12, x7, x12, cs + bcs Lbeeu_reduction_loop + + // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) + // out = -Y = n-Y + subs x8, x0, x8 + sbcs x9, x1, x9 + sbcs x10, x2, x10 + sbcs x11, x30, x11 + + // Save Y in output (out (x0) was saved on the stack) + ldr x3, [sp,#96] + stp x8, x9, [x3] + stp x10, x11, [x3,#16] + // return 1 (success) + mov x0, #1 + b Lbeeu_finish + +Lbeeu_err: + // return 0 (error) + eor x0, x0, x0 + +Lbeeu_finish: + // Restore callee-saved registers, except x0, x2 + add sp,x29,#0 + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldp x29,x30,[sp],#112 + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-apple.S similarity index 97% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-apple.S index da8a6632..2becb6f0 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -322,7 +321,6 @@ L$beeu_finish: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-linux.S index d6531c7e..23fdd057 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/p256_beeu-x86_64-asm-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/p256_beeu-x86_64-asm-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -336,7 +335,6 @@ _CET_ENDBR .size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-apple.S similarity index 90% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-apple.S index 5fad1be8..e18ec29d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -57,7 +56,6 @@ L$err: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-linux.S similarity index 91% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-linux.S index 9dc67041..28abe4e1 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rdrand-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/rdrand-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -57,7 +56,6 @@ _CET_ENDBR .cfi_endproc .size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-apple.S index 5eedf9cb..297eadc2 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1749,7 +1748,6 @@ L$inc: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-linux.S index 616aefa8..dfd1858f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/rsaz-avx2-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/rsaz-avx2-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1749,7 +1748,6 @@ _CET_ENDBR .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-apple.S new file mode 100644 index 00000000..7b75e41c --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-apple.S @@ -0,0 +1,3787 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha1_block_data_order_nohw +.private_extern _sha1_block_data_order_nohw +.align 4 +_sha1_block_data_order_nohw: +L_sha1_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%ebp + movl 24(%esp),%esi + movl 28(%esp),%eax + subl $76,%esp + shll $6,%eax + addl %esi,%eax + movl %eax,104(%esp) + movl 16(%ebp),%edi + jmp L000loop +.align 4,0x90 +L000loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,(%esp) + movl %ebx,4(%esp) + movl %ecx,8(%esp) + movl %edx,12(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,16(%esp) + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %edx,28(%esp) + movl 32(%esi),%eax + movl 36(%esi),%ebx + movl 40(%esi),%ecx + movl 44(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edx,44(%esp) + movl 48(%esi),%eax + movl 52(%esi),%ebx + movl 56(%esi),%ecx + movl 60(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,48(%esp) + movl %ebx,52(%esp) + movl %ecx,56(%esp) + movl %edx,60(%esp) + movl %esi,100(%esp) + movl (%ebp),%eax + movl 4(%ebp),%ebx + movl 8(%ebp),%ecx + movl 12(%ebp),%edx + # 00_15 0 + movl %ecx,%esi + movl %eax,%ebp + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl (%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 1 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 4(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 2 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 8(%esp),%ecx + andl %esi,%edx + rorl $2,%esi 
+ xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 3 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 12(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + # 00_15 4 + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 16(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + # 00_15 5 + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 20(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + # 00_15 6 + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 24(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 7 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 28(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 8 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 32(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 9 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 36(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + # 00_15 10 + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 40(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + # 00_15 11 + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 44(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + # 00_15 12 + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 48(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 13 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 52(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 14 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 56(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 15 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 60(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + movl (%esp),%ebx + addl %ebp,%ecx + # 16_19 16 + movl %edi,%ebp + xorl 8(%esp),%ebx + xorl %esi,%ebp + xorl 32(%esp),%ebx + andl %edx,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + xorl %esi,%ebp + addl %ebp,%eax + movl %ecx,%ebp + rorl $2,%edx + movl %ebx,(%esp) + roll $5,%ebp + leal 1518500249(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + # 16_19 17 + movl %edx,%ebp + xorl 12(%esp),%eax + xorl %edi,%ebp + xorl 36(%esp),%eax + andl %ecx,%ebp + xorl 56(%esp),%eax + roll $1,%eax + xorl %edi,%ebp + addl %ebp,%esi + movl %ebx,%ebp + rorl $2,%ecx + movl %eax,4(%esp) + roll $5,%ebp + leal 1518500249(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl 
%ebp,%eax + # 16_19 18 + movl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 40(%esp),%esi + andl %ebx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + xorl %edx,%ebp + addl %ebp,%edi + movl %eax,%ebp + rorl $2,%ebx + movl %esi,8(%esp) + roll $5,%ebp + leal 1518500249(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + # 16_19 19 + movl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 44(%esp),%edi + andl %eax,%ebp + xorl (%esp),%edi + roll $1,%edi + xorl %ecx,%ebp + addl %ebp,%edx + movl %esi,%ebp + rorl $2,%eax + movl %edi,12(%esp) + roll $5,%ebp + leal 1518500249(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + # 20_39 20 + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + # 20_39 21 + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + # 20_39 22 + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + # 20_39 23 + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + # 20_39 24 + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + # 20_39 25 + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,36(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + # 20_39 26 + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + # 20_39 27 + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + # 20_39 28 + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + # 20_39 29 + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + 
movl %eax,52(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + # 20_39 30 + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,56(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + # 20_39 31 + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,60(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl (%esp),%edx + addl %ebp,%edi + # 20_39 32 + movl %esi,%ebp + xorl 8(%esp),%edx + xorl %eax,%ebp + xorl 32(%esp),%edx + xorl %ebx,%ebp + xorl 52(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 4(%esp),%ecx + addl %ebp,%edx + # 20_39 33 + movl %edi,%ebp + xorl 12(%esp),%ecx + xorl %esi,%ebp + xorl 36(%esp),%ecx + xorl %eax,%ebp + xorl 56(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,4(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 8(%esp),%ebx + addl %ebp,%ecx + # 20_39 34 + movl %edx,%ebp + xorl 16(%esp),%ebx + xorl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl 60(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,8(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 12(%esp),%eax + addl %ebp,%ebx + # 20_39 35 + movl %ecx,%ebp + xorl 20(%esp),%eax + xorl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl (%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,12(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 16(%esp),%esi + addl %ebp,%eax + # 20_39 36 + movl %ebx,%ebp + xorl 24(%esp),%esi + xorl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 4(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,16(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 20(%esp),%edi + addl %ebp,%esi + # 20_39 37 + movl %eax,%ebp + xorl 28(%esp),%edi + xorl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 8(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,20(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 24(%esp),%edx + addl %ebp,%edi + # 20_39 38 + movl %esi,%ebp + xorl 32(%esp),%edx + xorl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 12(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,24(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 28(%esp),%ecx + addl %ebp,%edx + # 20_39 39 + movl %edi,%ebp + xorl 36(%esp),%ecx + xorl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 16(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,28(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 32(%esp),%ebx + addl %ebp,%ecx + # 40_59 40 + movl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl (%esp),%ebx + andl %edx,%ebp + xorl 20(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,32(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 36(%esp),%eax + addl %ebp,%ebx + # 40_59 41 + movl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl 4(%esp),%eax + andl %ecx,%ebp + xorl 
24(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,36(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 40(%esp),%esi + addl %ebp,%eax + # 40_59 42 + movl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 8(%esp),%esi + andl %ebx,%ebp + xorl 28(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,40(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 44(%esp),%edi + addl %ebp,%esi + # 40_59 43 + movl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 12(%esp),%edi + andl %eax,%ebp + xorl 32(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,44(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 48(%esp),%edx + addl %ebp,%edi + # 40_59 44 + movl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 16(%esp),%edx + andl %esi,%ebp + xorl 36(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,48(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 52(%esp),%ecx + addl %ebp,%edx + # 40_59 45 + movl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 20(%esp),%ecx + andl %edi,%ebp + xorl 40(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,52(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 56(%esp),%ebx + addl %ebp,%ecx + # 40_59 46 + movl %edi,%ebp + xorl (%esp),%ebx + xorl %esi,%ebp + xorl 24(%esp),%ebx + andl %edx,%ebp + xorl 44(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,56(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 60(%esp),%eax + addl %ebp,%ebx + # 40_59 47 + movl %edx,%ebp + xorl 4(%esp),%eax + xorl %edi,%ebp + xorl 28(%esp),%eax + andl %ecx,%ebp + xorl 48(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,60(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl (%esp),%esi + addl %ebp,%eax + # 40_59 48 + movl %ecx,%ebp + xorl 8(%esp),%esi + xorl %edx,%ebp + xorl 32(%esp),%esi + andl %ebx,%ebp + xorl 52(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 4(%esp),%edi + addl %ebp,%esi + # 40_59 49 + movl %ebx,%ebp + xorl 12(%esp),%edi + xorl %ecx,%ebp + xorl 36(%esp),%edi + andl %eax,%ebp + xorl 56(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,4(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 8(%esp),%edx + addl %ebp,%edi + # 40_59 50 + movl %eax,%ebp + xorl 16(%esp),%edx + xorl %ebx,%ebp + xorl 40(%esp),%edx + andl %esi,%ebp + xorl 60(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,8(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 12(%esp),%ecx + addl %ebp,%edx + # 40_59 51 + movl %esi,%ebp + xorl 20(%esp),%ecx + xorl %eax,%ebp + xorl 44(%esp),%ecx + andl %edi,%ebp + xorl (%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + 
rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,12(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 16(%esp),%ebx + addl %ebp,%ecx + # 40_59 52 + movl %edi,%ebp + xorl 24(%esp),%ebx + xorl %esi,%ebp + xorl 48(%esp),%ebx + andl %edx,%ebp + xorl 4(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,16(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 20(%esp),%eax + addl %ebp,%ebx + # 40_59 53 + movl %edx,%ebp + xorl 28(%esp),%eax + xorl %edi,%ebp + xorl 52(%esp),%eax + andl %ecx,%ebp + xorl 8(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,20(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 24(%esp),%esi + addl %ebp,%eax + # 40_59 54 + movl %ecx,%ebp + xorl 32(%esp),%esi + xorl %edx,%ebp + xorl 56(%esp),%esi + andl %ebx,%ebp + xorl 12(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,24(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 28(%esp),%edi + addl %ebp,%esi + # 40_59 55 + movl %ebx,%ebp + xorl 36(%esp),%edi + xorl %ecx,%ebp + xorl 60(%esp),%edi + andl %eax,%ebp + xorl 16(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,28(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 32(%esp),%edx + addl %ebp,%edi + # 40_59 56 + movl %eax,%ebp + xorl 40(%esp),%edx + xorl %ebx,%ebp + xorl (%esp),%edx + andl %esi,%ebp + xorl 20(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,32(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 36(%esp),%ecx + addl %ebp,%edx + # 40_59 57 + movl %esi,%ebp + xorl 44(%esp),%ecx + xorl %eax,%ebp + xorl 4(%esp),%ecx + andl %edi,%ebp + xorl 24(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,36(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 40(%esp),%ebx + addl %ebp,%ecx + # 40_59 58 + movl %edi,%ebp + xorl 48(%esp),%ebx + xorl %esi,%ebp + xorl 8(%esp),%ebx + andl %edx,%ebp + xorl 28(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,40(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 44(%esp),%eax + addl %ebp,%ebx + # 40_59 59 + movl %edx,%ebp + xorl 52(%esp),%eax + xorl %edi,%ebp + xorl 12(%esp),%eax + andl %ecx,%ebp + xorl 32(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,44(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 48(%esp),%esi + addl %ebp,%eax + # 20_39 60 + movl %ebx,%ebp + xorl 56(%esp),%esi + xorl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 36(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,48(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 52(%esp),%edi + addl %ebp,%esi + # 20_39 61 + movl %eax,%ebp + xorl 60(%esp),%edi + xorl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 40(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,52(%esp) + leal 
3395469782(%edi,%edx,1),%edi + movl 56(%esp),%edx + addl %ebp,%edi + # 20_39 62 + movl %esi,%ebp + xorl (%esp),%edx + xorl %eax,%ebp + xorl 24(%esp),%edx + xorl %ebx,%ebp + xorl 44(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,56(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 60(%esp),%ecx + addl %ebp,%edx + # 20_39 63 + movl %edi,%ebp + xorl 4(%esp),%ecx + xorl %esi,%ebp + xorl 28(%esp),%ecx + xorl %eax,%ebp + xorl 48(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,60(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl (%esp),%ebx + addl %ebp,%ecx + # 20_39 64 + movl %edx,%ebp + xorl 8(%esp),%ebx + xorl %edi,%ebp + xorl 32(%esp),%ebx + xorl %esi,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + # 20_39 65 + movl %ecx,%ebp + xorl 12(%esp),%eax + xorl %edx,%ebp + xorl 36(%esp),%eax + xorl %edi,%ebp + xorl 56(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,4(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl %ebp,%eax + # 20_39 66 + movl %ebx,%ebp + xorl 16(%esp),%esi + xorl %ecx,%ebp + xorl 40(%esp),%esi + xorl %edx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,8(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + # 20_39 67 + movl %eax,%ebp + xorl 20(%esp),%edi + xorl %ebx,%ebp + xorl 44(%esp),%edi + xorl %ecx,%ebp + xorl (%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,12(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + # 20_39 68 + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + # 20_39 69 + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + # 20_39 70 + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + # 20_39 71 + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + # 20_39 72 + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + # 20_39 73 + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + 
roll $5,%ebp + movl %edi,36(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + # 20_39 74 + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + # 20_39 75 + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + # 20_39 76 + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + # 20_39 77 + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + leal 3395469782(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + # 20_39 78 + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + leal 3395469782(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + # 20_39 79 + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + leal 3395469782(%edi,%edx,1),%edi + addl %ebp,%edi + movl 96(%esp),%ebp + movl 100(%esp),%edx + addl (%ebp),%edi + addl 4(%ebp),%esi + addl 8(%ebp),%eax + addl 12(%ebp),%ebx + addl 16(%ebp),%ecx + movl %edi,(%ebp) + addl $64,%edx + movl %esi,4(%ebp) + cmpl 104(%esp),%edx + movl %eax,8(%ebp) + movl %ecx,%edi + movl %ebx,12(%ebp) + movl %edx,%esi + movl %ecx,16(%ebp) + jb L000loop + addl $76,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha1_block_data_order_ssse3 +.private_extern _sha1_block_data_order_ssse3 +.align 4 +_sha1_block_data_order_ssse3: +L_sha1_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call L001pic_point +L001pic_point: + popl %ebp + leal LK_XX_XX-L001pic_point(%ebp),%ebp + movdqa (%ebp),%xmm7 + movdqa 16(%ebp),%xmm0 + movdqa 32(%ebp),%xmm1 + movdqa 48(%ebp),%xmm2 + movdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + movdqa %xmm0,112(%esp) + movdqa %xmm1,128(%esp) + movdqa %xmm2,144(%esp) + shll $6,%edx + movdqa %xmm7,160(%esp) + addl %ebp,%edx + movdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + movdqu -64(%ebp),%xmm0 + movdqu -48(%ebp),%xmm1 + movdqu -32(%ebp),%xmm2 + movdqu -16(%ebp),%xmm3 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + movdqa %xmm7,96(%esp) +.byte 102,15,56,0,222 + paddd %xmm7,%xmm0 + paddd %xmm7,%xmm1 + paddd %xmm7,%xmm2 + movdqa %xmm0,(%esp) + psubd %xmm7,%xmm0 + movdqa %xmm1,16(%esp) + psubd %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + movl %ecx,%ebp + psubd 
%xmm7,%xmm2 + xorl %edx,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebp,%esi + jmp L002loop +.align 4,0x90 +L002loop: + rorl $2,%ebx + xorl %edx,%esi + movl %eax,%ebp + punpcklqdq %xmm1,%xmm4 + movdqa %xmm3,%xmm6 + addl (%esp),%edi + xorl %ecx,%ebx + paddd %xmm3,%xmm7 + movdqa %xmm0,64(%esp) + roll $5,%eax + addl %esi,%edi + psrldq $4,%xmm6 + andl %ebx,%ebp + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%edi + rorl $7,%eax + pxor %xmm2,%xmm6 + xorl %ecx,%ebp + movl %edi,%esi + addl 4(%esp),%edx + pxor %xmm6,%xmm4 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm7,48(%esp) + addl %ebp,%edx + andl %eax,%esi + movdqa %xmm4,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + movdqa %xmm4,%xmm6 + xorl %ebx,%esi + pslldq $12,%xmm0 + paddd %xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + psrld $31,%xmm6 + xorl %eax,%edi + roll $5,%edx + movdqa %xmm0,%xmm7 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + psrld $30,%xmm0 + addl %edx,%ecx + rorl $7,%edx + por %xmm6,%xmm4 + xorl %eax,%ebp + movl %ecx,%esi + addl 12(%esp),%ebx + pslld $2,%xmm7 + xorl %edi,%edx + roll $5,%ecx + pxor %xmm0,%xmm4 + movdqa 96(%esp),%xmm0 + addl %ebp,%ebx + andl %edx,%esi + pxor %xmm7,%xmm4 + pshufd $238,%xmm1,%xmm5 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + xorl %edi,%esi + movl %ebx,%ebp + punpcklqdq %xmm2,%xmm5 + movdqa %xmm4,%xmm7 + addl 16(%esp),%eax + xorl %edx,%ecx + paddd %xmm4,%xmm0 + movdqa %xmm1,80(%esp) + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm7 + andl %ecx,%ebp + xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm7 + xorl %edx,%ebp + movl %eax,%esi + addl 20(%esp),%edi + pxor %xmm7,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm0,(%esp) + addl %ebp,%edi + andl %ebx,%esi + movdqa %xmm5,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + movdqa %xmm5,%xmm7 + xorl %ecx,%esi + pslldq $12,%xmm1 + paddd %xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + psrld $31,%xmm7 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm1,%xmm0 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + psrld $30,%xmm1 + addl %edi,%edx + rorl $7,%edi + por %xmm7,%xmm5 + xorl %ebx,%ebp + movl %edx,%esi + addl 28(%esp),%ecx + pslld $2,%xmm0 + xorl %eax,%edi + roll $5,%edx + pxor %xmm1,%xmm5 + movdqa 112(%esp),%xmm1 + addl %ebp,%ecx + andl %edi,%esi + pxor %xmm0,%xmm5 + pshufd $238,%xmm2,%xmm6 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + punpcklqdq %xmm3,%xmm6 + movdqa %xmm5,%xmm0 + addl 32(%esp),%ebx + xorl %edi,%edx + paddd %xmm5,%xmm1 + movdqa %xmm2,96(%esp) + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm0 + andl %edx,%ebp + xorl %edi,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm0 + xorl %edi,%ebp + movl %ebx,%esi + addl 36(%esp),%eax + pxor %xmm0,%xmm6 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm1,16(%esp) + addl %ebp,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm0 + xorl %edx,%esi + pslldq $12,%xmm2 + paddd %xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + psrld $31,%xmm0 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm2,%xmm1 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + psrld $30,%xmm2 + addl %eax,%edi + rorl $7,%eax + por %xmm0,%xmm6 + xorl %ecx,%ebp + movdqa 64(%esp),%xmm0 + movl %edi,%esi + addl 44(%esp),%edx + pslld $2,%xmm1 + xorl %ebx,%eax + roll $5,%edi + pxor %xmm2,%xmm6 + movdqa 112(%esp),%xmm2 + addl %ebp,%edx + andl %eax,%esi + pxor %xmm1,%xmm6 + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%esi 
+ movl %edx,%ebp + punpcklqdq %xmm4,%xmm7 + movdqa %xmm6,%xmm1 + addl 48(%esp),%ecx + xorl %eax,%edi + paddd %xmm6,%xmm2 + movdqa %xmm3,64(%esp) + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm1 + andl %edi,%ebp + xorl %eax,%edi + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm1 + xorl %eax,%ebp + movl %ecx,%esi + addl 52(%esp),%ebx + pxor %xmm1,%xmm7 + xorl %edi,%edx + roll $5,%ecx + movdqa %xmm2,32(%esp) + addl %ebp,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm1 + xorl %edi,%esi + pslldq $12,%xmm3 + paddd %xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + psrld $31,%xmm1 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm3,%xmm2 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + psrld $30,%xmm3 + addl %ebx,%eax + rorl $7,%ebx + por %xmm1,%xmm7 + xorl %edx,%ebp + movdqa 80(%esp),%xmm1 + movl %eax,%esi + addl 60(%esp),%edi + pslld $2,%xmm2 + xorl %ecx,%ebx + roll $5,%eax + pxor %xmm3,%xmm7 + movdqa 112(%esp),%xmm3 + addl %ebp,%edi + andl %ebx,%esi + pxor %xmm2,%xmm7 + pshufd $238,%xmm6,%xmm2 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,80(%esp) + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm3,%xmm4 + addl %esi,%edx + paddd %xmm7,%xmm3 + andl %eax,%ebp + pxor %xmm2,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + roll $5,%edx + pslld $2,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + psrld $30,%xmm2 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + xorl %edi,%edx + roll $5,%ecx + por %xmm2,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + movdqa 96(%esp),%xmm2 + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + pshufd $238,%xmm7,%xmm3 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 16(%esp),%edi + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm2,%xmm1 + movdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm4,%xmm5 + rorl $7,%ebx + paddd %xmm0,%xmm4 + addl %eax,%edi + pxor %xmm3,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm3 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm3,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + movdqa 64(%esp),%xmm3 + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + pshufd $238,%xmm0,%xmm4 + addl %ecx,%ebx + addl 32(%esp),%eax + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + pxor %xmm3,%xmm2 + movdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + movdqa 128(%esp),%xmm6 + rorl $7,%ecx + paddd %xmm1,%xmm5 + addl %ebx,%eax + pxor %xmm4,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + pslld $2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + psrld $30,%xmm4 + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + por %xmm4,%xmm2 
+ addl 44(%esp),%ecx + xorl %eax,%ebp + movdqa 80(%esp),%xmm4 + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + pshufd $238,%xmm1,%xmm5 + addl %edx,%ecx + addl 48(%esp),%ebx + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + pxor %xmm4,%xmm3 + movdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%edx + paddd %xmm2,%xmm6 + addl %ecx,%ebx + pxor %xmm5,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pslld $2,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + psrld $30,%xmm5 + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + por %xmm5,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + movdqa 96(%esp),%xmm5 + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + pshufd $238,%xmm2,%xmm6 + addl %edi,%edx + addl (%esp),%ecx + pxor %xmm0,%xmm4 + punpcklqdq %xmm3,%xmm6 + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + pxor %xmm5,%xmm4 + movdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + movdqa %xmm7,%xmm0 + rorl $7,%edi + paddd %xmm3,%xmm7 + addl %edx,%ecx + pxor %xmm6,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm6 + movdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + pslld $2,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + psrld $30,%xmm6 + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + por %xmm6,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + movdqa 64(%esp),%xmm6 + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + pshufd $238,%xmm3,%xmm7 + addl %eax,%edi + addl 16(%esp),%edx + pxor %xmm1,%xmm5 + punpcklqdq %xmm4,%xmm7 + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + pxor %xmm6,%xmm5 + movdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + movdqa %xmm0,%xmm1 + rorl $7,%eax + paddd %xmm4,%xmm0 + addl %edi,%edx + pxor %xmm7,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm7 + movdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + pslld $2,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + psrld $30,%xmm7 + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + por %xmm7,%xmm5 + addl 28(%esp),%eax + movdqa 80(%esp),%xmm7 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pshufd $238,%xmm4,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 32(%esp),%edi + pxor %xmm2,%xmm6 + punpcklqdq %xmm5,%xmm0 + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + pxor %xmm7,%xmm6 + movdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + roll $5,%eax + movdqa %xmm1,%xmm2 + addl %esi,%edi + paddd %xmm5,%xmm1 + xorl %ebx,%ebp + pxor %xmm0,%xmm6 + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + andl %ebx,%ebp + movdqa %xmm6,%xmm0 + movdqa %xmm1,16(%esp) + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + pslld $2,%xmm6 + addl %ebp,%edx + xorl %eax,%esi + psrld $30,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + por %xmm0,%xmm6 + movl %edx,%ebp + xorl %eax,%esi + movdqa 96(%esp),%xmm0 + roll $5,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + pshufd 
$238,%xmm5,%xmm1 + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 48(%esp),%eax + pxor %xmm3,%xmm7 + punpcklqdq %xmm6,%xmm1 + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + pxor %xmm0,%xmm7 + movdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + roll $5,%ebx + movdqa 144(%esp),%xmm3 + addl %esi,%eax + paddd %xmm6,%xmm2 + xorl %ecx,%ebp + pxor %xmm1,%xmm7 + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + andl %ecx,%ebp + movdqa %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + pslld $2,%xmm7 + addl %ebp,%edi + xorl %ebx,%esi + psrld $30,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + por %xmm1,%xmm7 + movl %edi,%ebp + xorl %ebx,%esi + movdqa 64(%esp),%xmm1 + roll $5,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + pshufd $238,%xmm6,%xmm2 + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl (%esp),%ebx + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + roll $5,%ecx + movdqa %xmm3,%xmm4 + addl %esi,%ebx + paddd %xmm7,%xmm3 + xorl %edx,%ebp + pxor %xmm2,%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + andl %edx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pslld $2,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + psrld $30,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + por %xmm2,%xmm0 + movl %eax,%ebp + xorl %ecx,%esi + movdqa 80(%esp),%xmm2 + roll $5,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + pshufd $238,%xmm7,%xmm3 + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 16(%esp),%ecx + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + pxor %xmm2,%xmm1 + movdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + roll $5,%edx + movdqa %xmm4,%xmm5 + addl %esi,%ecx + paddd %xmm0,%xmm4 + xorl %edi,%ebp + pxor %xmm3,%xmm1 + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + andl %edi,%ebp + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + pslld $2,%xmm1 + addl %ebp,%ebx + xorl %edx,%esi + psrld $30,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + por %xmm3,%xmm1 + movl %ebx,%ebp + xorl %edx,%esi + movdqa 96(%esp),%xmm3 + roll $5,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + pshufd $238,%xmm0,%xmm4 + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 32(%esp),%edx + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + pxor %xmm3,%xmm2 + movdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + roll $5,%edi + movdqa 
%xmm5,%xmm6 + addl %esi,%edx + paddd %xmm1,%xmm5 + xorl %eax,%ebp + pxor %xmm4,%xmm2 + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + andl %eax,%ebp + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + pslld $2,%xmm2 + addl %ebp,%ecx + xorl %edi,%esi + psrld $30,%xmm4 + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + por %xmm4,%xmm2 + movl %ecx,%ebp + xorl %edi,%esi + movdqa 64(%esp),%xmm4 + roll $5,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + pshufd $238,%xmm1,%xmm5 + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + addl 48(%esp),%edi + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm4,%xmm3 + movdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%ebx + paddd %xmm2,%xmm6 + addl %eax,%edi + pxor %xmm5,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm5 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm5,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl (%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + paddd %xmm3,%xmm7 + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + movdqa %xmm7,48(%esp) + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je L003done + movdqa 160(%esp),%xmm7 + movdqa 176(%esp),%xmm6 + movdqu (%ebp),%xmm0 + movdqu 16(%ebp),%xmm1 + movdqu 32(%ebp),%xmm2 + movdqu 48(%ebp),%xmm3 + addl $64,%ebp +.byte 102,15,56,0,198 + movl %ebp,196(%esp) + movdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx +.byte 102,15,56,0,206 + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + paddd %xmm7,%xmm0 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + movdqa %xmm0,(%esp) + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + psubd %xmm7,%xmm0 + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi +.byte 102,15,56,0,214 + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + paddd %xmm7,%xmm1 + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + movdqa %xmm1,16(%esp) + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + psubd %xmm7,%xmm1 + roll $5,%ebx + addl %esi,%eax + xorl 
%edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax +.byte 102,15,56,0,222 + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + paddd %xmm7,%xmm2 + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + movdqa %xmm2,32(%esp) + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + psubd %xmm7,%xmm2 + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %ecx,%ebx + movl %edx,12(%ebp) + xorl %edx,%ebx + movl %edi,16(%ebp) + movl %esi,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebx,%esi + movl %ebp,%ebx + jmp L002loop +.align 4,0x90 +L003done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha1_block_data_order_avx +.private_extern _sha1_block_data_order_avx +.align 4 +_sha1_block_data_order_avx: +L_sha1_block_data_order_avx_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call L004pic_point +L004pic_point: + popl %ebp + leal LK_XX_XX-L004pic_point(%ebp),%ebp + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp 
+ movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp L005loop +.align 4,0x90 +L005loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor 
%xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld 
$30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + 
movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + 
xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl 
%ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je L006done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp L005loop +.align 4,0x90 +L006done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 
28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK_XX_XX: +.long 1518500249,1518500249,1518500249,1518500249 +.long 1859775393,1859775393,1859775393,1859775393 +.long 2400959708,2400959708,2400959708,2400959708 +.long 3395469782,3395469782,3395469782,3395469782 +.long 66051,67438087,134810123,202182159 +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 +.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 +.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-586-linux.S index 11b6cb2f..0778a6fc 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
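The LK_XX_XX table emitted at the end of the x86 SHA-1 assembly above holds the four standard SHA-1 round constants (1518500249 = 0x5a827999, 1859775393 = 0x6ed9eba1, 2400959708 = 0x8f1bbcdc, 3395469782 = 0xca62c1d6), each repeated across a 128-bit lane for the SSSE3/AVX message-schedule paths, followed by the byte-swap mask and the CRYPTOGAMS attribution string. The unrolled "20_39" and "40_59" round groups above are the usual SHA-1 compression rounds: the three xorl/roll $1 sequence is the message schedule, and the leal K(%W,%e,1) form folds the round constant, the expanded word, and e into one add. As a reading aid only, here is a minimal portable C sketch of that logic; rotl32, sha1_f, sha1_w, and sha1_round are hypothetical helper names for this example and are not part of the vendored BoringSSL sources.

#include <stdint.h>

/* Rotate left; n is expected to be 1..31 in these rounds (1, 5, 30). */
static uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* Round function per group, matching the constants in the assembly:
 *   rounds  0..19  Ch(b,c,d)        K = 0x5a827999
 *   rounds 20..39  b ^ c ^ d        K = 0x6ed9eba1
 *   rounds 40..59  Maj(b,c,d)       K = 0x8f1bbcdc
 *   rounds 60..79  b ^ c ^ d        K = 0xca62c1d6                      */
static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d) {
    if (t < 20) return (b & c) | (~b & d);
    if (t < 40) return b ^ c ^ d;
    if (t < 60) return (b & c) | (b & d) | (c & d);
    return b ^ c ^ d;
}

/* Message schedule for t >= 16, the "xorl, xorl, xorl, roll $1" pattern:
 * W[t] = rotl(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1), kept in a 16-word
 * circular buffer exactly as the 64-byte stack window is used above.    */
static uint32_t sha1_w(uint32_t w[16], int t) {
    uint32_t x = w[(t - 3) & 15] ^ w[(t - 8) & 15] ^ w[(t - 14) & 15] ^ w[t & 15];
    return w[t & 15] = rotl32(x, 1);
}

/* One round: e += rotl(a,5) + F(b,c,d) + K + W[t]; b = rotl(b,30); then
 * the five working variables shift down by one position.                */
static void sha1_round(uint32_t s[5], uint32_t k, uint32_t f, uint32_t wt) {
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
    e += rotl32(a, 5) + f + k + wt;
    b = rotl32(b, 30);
    s[0] = e; s[1] = a; s[2] = b; s[3] = c; s[4] = d;
}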
@@ -3788,7 +3787,6 @@ sha1_block_data_order_avx: .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv4-large-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-armv4-large-linux.S index f9bb4730..aea08091 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv4-large-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv4-large-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1481,7 +1480,6 @@ sha1_block_data_order_hw: .size sha1_block_data_order_hw,.-sha1_block_data_order_hw #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-apple.S index 21c39904..ea3e5513 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1218,7 +1217,6 @@ Lconst: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-linux.S index 296b18c6..4afb8cfd 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
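The renames in these hunks all follow the same pattern: each per-platform copy under crypto/fipsmodule/ (for example sha1-586-linux.linux.x86.S or sha1-armv8-ios.ios.aarch64.S) becomes a single file under gen/bcm/, and the hand-added outer wrapper of the form #if defined(__i386__) && defined(__linux__) ... #endif is dropped. What remains are the guards the generator itself emits, keyed on OPENSSL_NO_ASM, the target macro (OPENSSL_X86, OPENSSL_ARM, OPENSSL_AARCH64), and the ABI macro (__ELF__, __APPLE__, or _WIN32), so one generated copy can be fed to every compiler and the preprocessor strips it where it does not apply. A rough sketch of the remaining layout, with the include names and assembly bodies elided because they are not shown in these hunks, is:

/* Sketch only: guard layout of a regenerated gen/bcm/*.S file after the
 * outer per-platform #if/#endif wrapper is removed.                      */
#define BORINGSSL_PREFIX CCryptoBoringSSL
/* (prefix-symbols include, name elided in this sketch) */

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
/* AArch64 Windows SHA-1 assembly body */
#endif  /* !OPENSSL_NO_ASM && OPENSSL_AARCH64 && _WIN32 */

#if defined(__linux__) && defined(__ELF__)
/* .section .note.GNU-stack,"",%progbits  (non-executable stack note) */
#endif

The new sha1-armv8-win.S below follows the same layout: its body is guarded by OPENSSL_AARCH64 && _WIN32 and compiles to nothing on every other target.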
@@ -1218,7 +1217,6 @@ sha1_block_data_order_hw: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-win.S new file mode 100644 index 00000000..5635fdee --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-armv8-win.S @@ -0,0 +1,1227 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.text + +.globl sha1_block_data_order_nohw + +.def sha1_block_data_order_nohw + .type 32 +.endef +.align 6 +sha1_block_data_order_nohw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp w20,w21,[x0] + ldp w22,w23,[x0,#8] + ldr w24,[x0,#16] + +Loop: + ldr x3,[x1],#64 + movz w28,#0x7999 + sub x2,x2,#1 + movk w28,#0x5a82,lsl#16 +#ifdef __AARCH64EB__ + ror x3,x3,#32 +#else + rev32 x3,x3 +#endif + add w24,w24,w28 // warm it up + add w24,w24,w3 + lsr x4,x3,#32 + ldr x5,[x1,#-56] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w4 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x5,x5,#32 +#else + rev32 x5,x5 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w5 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x6,x5,#32 + ldr x7,[x1,#-48] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w6 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x7,x7,#32 +#else + rev32 x7,x7 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w7 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x8,x7,#32 + ldr x9,[x1,#-40] + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w8 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x9,x9,#32 +#else + rev32 x9,x9 +#endif + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w9 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + lsr x10,x9,#32 + ldr x11,[x1,#-32] + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w10 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x11,x11,#32 +#else + rev32 x11,x11 +#endif + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add 
w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w11 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + lsr x12,x11,#32 + ldr x13,[x1,#-24] + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w12 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x13,x13,#32 +#else + rev32 x13,x13 +#endif + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w13 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + lsr x14,x13,#32 + ldr x15,[x1,#-16] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w14 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x15,x15,#32 +#else + rev32 x15,x15 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w15 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x16,x15,#32 + ldr x17,[x1,#-8] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w16 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x17,x17,#32 +#else + rev32 x17,x17 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w17 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x19,x17,#32 + eor w3,w3,w5 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w3,w3,w11 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w3,w3,w16 + ror w22,w22,#2 + add w24,w24,w19 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + eor w4,w4,w12 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + eor w4,w4,w17 + ror w21,w21,#2 + add w23,w23,w3 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + eor w5,w5,w13 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + eor w5,w5,w19 + ror w20,w20,#2 + add w22,w22,w4 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + eor w6,w6,w14 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + eor w6,w6,w3 + ror w24,w24,#2 + add w21,w21,w5 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + eor w7,w7,w15 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + eor w7,w7,w4 + ror w23,w23,#2 + add w20,w20,w6 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w7,w7,#31 + movz w28,#0xeba1 + movk w28,#0x6ed9,lsl#16 + eor w8,w8,w10 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w8,w8,w16 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w8,w8,w5 + ror w22,w22,#2 + add w24,w24,w7 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror 
w8,w8,#31 + eor w9,w9,w11 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w9,w9,w6 + add w23,w23,w8 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w10,w10,w7 + add w22,w22,w9 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w11,w11,w8 + add w21,w21,w10 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w12,w12,w9 + add w20,w20,w11 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w13,w13,w10 + add w24,w24,w12 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w14,w14,w11 + add w23,w23,w13 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w15,w15,w12 + add w22,w22,w14 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w16,w16,w13 + add w21,w21,w15 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w17,w17,w14 + add w20,w20,w16 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w19,w19,w15 + add w24,w24,w17 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w3,w3,w16 + add w23,w23,w19 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w4,w4,w17 + add w22,w22,w3 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w5,w5,w19 + add w21,w21,w4 // future e+=X[i] + add w22,w22,w25 // 
e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w6,w6,w3 + add w20,w20,w5 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w7,w7,w4 + add w24,w24,w6 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w8,w8,w5 + add w23,w23,w7 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w9,w9,w6 + add w22,w22,w8 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w10,w10,w7 + add w21,w21,w9 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w11,w11,w8 + add w20,w20,w10 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w11,w11,#31 + movz w28,#0xbcdc + movk w28,#0x8f1b,lsl#16 + eor w12,w12,w14 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w12,w12,w9 + add w24,w24,w11 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w13,w13,w15 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w13,w13,w5 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w13,w13,w10 + add w23,w23,w12 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w14,w14,w16 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w14,w14,w6 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w14,w14,w11 + add w22,w22,w13 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w15,w15,w17 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w15,w15,w7 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w15,w15,w12 + add w21,w21,w14 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w15,w15,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w16,w16,w19 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w16,w16,w8 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w16,w16,w13 + add w20,w20,w15 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w16,w16,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w17,w17,w3 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w17,w17,w9 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w17,w17,w14 + add w24,w24,w16 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w17,w17,#31 + orr w25,w21,w22 + 
and w26,w21,w22 + eor w19,w19,w4 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w19,w19,w10 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w19,w19,w15 + add w23,w23,w17 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w19,w19,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w3,w3,w5 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w3,w3,w11 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w3,w3,w16 + add w22,w22,w19 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w3,w3,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w4,w4,w6 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w4,w4,w12 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w4,w4,w17 + add w21,w21,w3 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w4,w4,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w5,w5,w7 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w5,w5,w13 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w5,w5,w19 + add w20,w20,w4 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w5,w5,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w6,w6,w8 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w6,w6,w14 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w6,w6,w3 + add w24,w24,w5 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w6,w6,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w7,w7,w9 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w7,w7,w15 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w7,w7,w4 + add w23,w23,w6 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w7,w7,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w8,w8,w10 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w8,w8,w16 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w8,w8,w5 + add w22,w22,w7 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w8,w8,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w9,w9,w11 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w9,w9,w17 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w9,w9,w6 + add w21,w21,w8 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w9,w9,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w10,w10,w12 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w10,w10,w19 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w10,w10,w7 + add w20,w20,w9 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w10,w10,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w11,w11,w13 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w11,w11,w3 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w11,w11,w8 + add w24,w24,w10 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w11,w11,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w12,w12,w14 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w12,w12,w4 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w12,w12,w9 + add w23,w23,w11 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w13,w13,w15 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w13,w13,w5 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror 
w20,w20,#2 + eor w13,w13,w10 + add w22,w22,w12 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w14,w14,w16 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w14,w14,w6 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w14,w14,w11 + add w21,w21,w13 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w15,w15,w17 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w15,w15,w7 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w15,w15,w12 + add w20,w20,w14 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w15,w15,#31 + movz w28,#0xc1d6 + movk w28,#0xca62,lsl#16 + orr w25,w22,w23 + and w26,w22,w23 + eor w16,w16,w19 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w16,w16,w8 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w16,w16,w13 + add w24,w24,w15 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w17,w17,w14 + add w23,w23,w16 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w19,w19,w15 + add w22,w22,w17 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w3,w3,w16 + add w21,w21,w19 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w4,w4,w17 + add w20,w20,w3 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w5,w5,w19 + add w24,w24,w4 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w6,w6,w3 + add w23,w23,w5 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w7,w7,w4 + add w22,w22,w6 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w8,w8,w5 + add w21,w21,w7 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w9,w9,w6 + add w20,w20,w8 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor 
w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w10,w10,w7 + add w24,w24,w9 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w11,w11,w8 + add w23,w23,w10 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w12,w12,w9 + add w22,w22,w11 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w13,w13,w10 + add w21,w21,w12 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w14,w14,w11 + add w20,w20,w13 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w15,w15,w12 + add w24,w24,w14 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w16,w16,w13 + add w23,w23,w15 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w17,w17,w14 + add w22,w22,w16 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w19,w19,w15 + add w21,w21,w17 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w19,w19,#31 + ldp w4,w5,[x0] + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w19 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ldp w6,w7,[x0,#8] + eor w25,w24,w22 + ror w27,w21,#27 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + ldr w8,[x0,#16] + add w20,w20,w25 // e+=F(b,c,d) + add w21,w21,w5 + add w22,w22,w6 + add w20,w20,w4 + add w23,w23,w7 + add w24,w24,w8 + stp w20,w21,[x0] + stp w22,w23,[x0,#8] + str w24,[x0,#16] + cbnz x2,Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret + +.globl sha1_block_data_order_hw + +.def sha1_block_data_order_hw + .type 32 +.endef +.align 6 +sha1_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + adrp x4,Lconst + add x4,x4,:lo12:Lconst + eor v1.16b,v1.16b,v1.16b + ld1 {v0.4s},[x0],#16 + ld1 {v1.s}[0],[x0] + sub x0,x0,#16 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + + add v20.4s,v16.4s,v4.4s + rev32 v6.16b,v6.16b + orr v22.16b,v0.16b,v0.16b // offload + + add v21.4s,v16.4s,v5.4s + rev32 v7.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b +.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 + add v20.4s,v16.4s,v6.4s +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 1 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v16.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 2 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v16.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 3 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 4 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 5 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 6 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 7 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 8 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 9 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 10 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 11 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 12 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 13 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 14 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 
0x5e280802 //sha1h v2.16b,v0.16b // 15 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 16 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 17 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + +.long 0x5e280803 //sha1h v3.16b,v0.16b // 18 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + +.long 0x5e280802 //sha1h v2.16b,v0.16b // 19 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + + add v1.4s,v1.4s,v2.4s + add v0.4s,v0.4s,v22.4s + + cbnz x2,Loop_hw + + st1 {v0.4s},[x0],#16 + st1 {v1.s}[0],[x0] + + ldr x29,[sp],#16 + ret + +.section .rodata +.align 6 +Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-apple.S index 7ae9b669..a3391770 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -5450,7 +5449,6 @@ K_XX_XX: .p2align 6 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-linux.S index 912b74c5..fc01d95f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha1-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha1-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -5450,7 +5449,6 @@ K_XX_XX: .align 64 .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-apple.S new file mode 100644 index 00000000..9447fcd3 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-apple.S @@ -0,0 +1,5598 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw +.align 4 +_sha256_block_data_order_nohw: +L_sha256_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L000pic_point +L000pic_point: + popl %ebp + leal LK256-L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) +L001no_xmm: + subl %edi,%eax + cmpl $256,%eax + jae L002unrolled + jmp L003loop +.align 4,0x90 +L003loop: + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + bswap %eax + movl 12(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 16(%edi),%eax + movl 20(%edi),%ebx + movl 24(%edi),%ecx + bswap %eax + movl 28(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 32(%edi),%eax + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %eax + movl 44(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 48(%edi),%eax + movl 52(%edi),%ebx + movl 56(%edi),%ecx + bswap %eax + movl 60(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + addl $64,%edi + leal -36(%esp),%esp + movl %edi,104(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) +.align 4,0x90 +L00400_15: + movl %edx,%ecx + movl 24(%esp),%esi + rorl $14,%ecx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl 96(%esp),%ebx + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + addl $4,%ebp + addl %ebx,%eax + cmpl $3248222580,%esi + jne L00400_15 + movl 156(%esp),%ecx + jmp L00516_63 +.align 4,0x90 +L00516_63: + movl %ecx,%ebx + movl 104(%esp),%esi + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 24(%esp),%esi + 
rorl $14,%ecx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl %ebx,96(%esp) + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx + addl $4,%ebp + addl %ebx,%eax + cmpl $3329325298,%esi + jne L00516_63 + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi + addl 16(%esi),%edx + addl 20(%esi),%eax + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %eax,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + leal 356(%esp),%esp + subl $256,%ebp + cmpl 8(%esp),%edi + jb L003loop + movl 12(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK256: +.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +.long 66051,67438087,134810123,202182159 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +.align 4,0x90 +L002unrolled: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebp + movl 8(%esi),%ecx + movl 12(%esi),%ebx + movl %ebp,4(%esp) + xorl %ecx,%ebp + movl %ecx,8(%esp) + movl %ebx,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %esi,28(%esp) + jmp L006grand_loop +.align 4,0x90 +L006grand_loop: + movl (%edi),%ebx + movl 4(%edi),%ecx + bswap %ebx + movl 8(%edi),%esi + bswap %ecx + movl %ebx,32(%esp) + bswap %esi + movl %ecx,36(%esp) + movl %esi,40(%esp) + movl 12(%edi),%ebx + movl 16(%edi),%ecx + bswap %ebx + movl 20(%edi),%esi + bswap %ecx + movl %ebx,44(%esp) + bswap %esi + movl %ecx,48(%esp) + movl %esi,52(%esp) + movl 24(%edi),%ebx + movl 28(%edi),%ecx + bswap %ebx + movl 32(%edi),%esi + bswap %ecx + movl %ebx,56(%esp) + bswap %esi + movl %ecx,60(%esp) + movl %esi,64(%esp) + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %ebx + movl 44(%edi),%esi + bswap %ecx + movl %ebx,68(%esp) + bswap %esi + movl %ecx,72(%esp) + movl %esi,76(%esp) + movl 48(%edi),%ebx + movl 52(%edi),%ecx + bswap %ebx + movl 56(%edi),%esi + bswap %ecx + movl %ebx,80(%esp) + bswap %esi + movl %ecx,84(%esp) + movl 
%esi,88(%esp) + movl 60(%edi),%ebx + addl $64,%edi + bswap %ebx + movl %edi,100(%esp) + movl %ebx,92(%esp) + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 32(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1116352408(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 36(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1899447441(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 40(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3049323471(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 44(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3921009573(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 48(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 961987163(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 52(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1508970993(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl 
(%esp),%edi + xorl %ecx,%edx + movl 56(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2453635748(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 60(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2870763221(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 64(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3624381080(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 68(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 310598401(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 72(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 607225278(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 76(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1426881987(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 80(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl 
%esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1925078388(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 84(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2162078206(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 88(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2614888103(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 92(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3248222580(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3835390401(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl 
%ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 4022224774(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 264347078(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 604807628(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 770255983(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1249150122(%ebx,%edx,1),%edx + xorl %ecx,%esi 
+ xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1555081692(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1996064986(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2554220882(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2821834349(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + 
movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2952996808(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3210313671(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3336571891(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3584528711(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl 
$3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 113926993(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 338241895(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 666307205(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 773529912(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 
76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1294757372(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1396182291(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1695183700(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1986661051(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 
(%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2177026350(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2456956037(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2730485921(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2820302411(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl 
%ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3259730800(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3345764771(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3516065817(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3600352804(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl 
%edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 4094571909(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 275423344(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 430227734(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 506948616(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl 
%edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 659060556(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 883997877(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 958139571(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1322822218(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1537002063(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + 
movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1747873779(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1955562222(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2024104815(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2227730452(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx 
+ movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2361852424(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2428436474(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2756734187(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3204031479(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl 
%esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3329325298(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 96(%esp),%esi + xorl %edi,%ebp + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebp + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebp,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebp,4(%esp) + xorl %edi,%ebp + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ebx + movl 28(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + cmpl 104(%esp),%edi + jb L006grand_loop + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha256_block_data_order_ssse3 +.private_extern _sha256_block_data_order_ssse3 +.align 4 +_sha256_block_data_order_ssse3: +L_sha256_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L007pic_point +L007pic_point: + popl %ebp + leal LK256-L007pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + movdqa 256(%ebp),%xmm7 + jmp L008grand_ssse3 +.align 4,0x90 +L008grand_ssse3: + movdqu (%edi),%xmm0 + movdqu 16(%edi),%xmm1 + movdqu 32(%edi),%xmm2 + movdqu 48(%edi),%xmm3 + addl $64,%edi +.byte 102,15,56,0,199 + movl %edi,100(%esp) +.byte 102,15,56,0,207 + movdqa (%ebp),%xmm4 +.byte 102,15,56,0,215 + movdqa 16(%ebp),%xmm5 + paddd %xmm0,%xmm4 +.byte 102,15,56,0,223 + movdqa 32(%ebp),%xmm6 + paddd %xmm1,%xmm5 + movdqa 48(%ebp),%xmm7 + movdqa %xmm4,32(%esp) + paddd %xmm2,%xmm6 + movdqa %xmm5,48(%esp) + paddd %xmm3,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + jmp L009ssse3_00_47 +.align 4,0x90 +L009ssse3_00_47: + addl $64,%ebp + movl %edx,%ecx + movdqa %xmm1,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi +.byte 102,15,58,15,224,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,250,4 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm3,%xmm7 + xorl 
%esi,%ecx + addl 32(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm0 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm0 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm0,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa (%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm0,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,32(%esp) + movl %edx,%ecx + movdqa %xmm2,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi +.byte 102,15,58,15,225,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,251,4 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm0,%xmm7 + xorl %esi,%ecx + addl 48(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm1 + movl %ebx,12(%esp) + xorl 
%ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm1 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm1,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 16(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm1,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,48(%esp) + movl %edx,%ecx + movdqa %xmm3,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi +.byte 102,15,58,15,226,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,248,4 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm1,%xmm7 + xorl %esi,%ecx + addl 64(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm2 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm2 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl 
$9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm2,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 32(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm2,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,64(%esp) + movl %edx,%ecx + movdqa %xmm0,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi +.byte 102,15,58,15,227,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,249,4 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm2,%xmm7 + xorl %esi,%ecx + addl 80(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm3 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm3 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm3,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 48(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl 
%esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm3,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne L009ssse3_00_47 + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + 
rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) 
+ xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + movdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb L008grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha256_block_data_order_avx +.private_extern _sha256_block_data_order_avx +.align 4 +_sha256_block_data_order_avx: +L_sha256_block_data_order_avx_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L010pic_point +L010pic_point: + popl %ebp + leal LK256-L010pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp L011grand_avx +.align 5,0x90 +L011grand_avx: + vmovdqu 
(%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp L012avx_00_47 +.align 4,0x90 +L012avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl 
$2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl 
$6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor 
%xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne L012avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + 
shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl 
%ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl 
%eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb L011grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-586-linux.S index 79af683c..4bc31d1d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -5599,7 +5598,6 @@ sha256_block_data_order_avx: ret .size sha256_block_data_order_avx,.-.L_sha256_block_data_order_avx_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv4-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-armv4-linux.S index 8e9f5522..0b3ba432 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv4-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv4-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -2839,7 +2838,6 @@ sha256_block_data_order_hw: .align 2 .align 2 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-apple.S index 341f867e..c7a89712 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1193,7 +1192,6 @@ Loop_hw: #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-linux.S index a2b88efb..99d3a3f7 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1193,7 +1192,6 @@ sha256_block_data_order_hw: .size sha256_block_data_order_hw,.-sha256_block_data_order_hw #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-win.S new file mode 100644 index 00000000..beb18820 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-armv8-win.S @@ -0,0 +1,1202 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. 
+// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +# include +#endif + +.text + +.globl sha256_block_data_order_nohw + +.def sha256_block_data_order_nohw + .type 32 +.endef +.align 6 +sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256 + add x30,x30,:lo12:LK256 + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add 
w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // 
(b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // 
d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic 
w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor 
w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) 
+ eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor 
w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 
83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw + +.def sha256_block_data_order_hw + .type 32 +.endef +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256 + add x3,x3,:lo12:LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 
//sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-apple.S index 8bb9c47d..d67f925c 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -4170,7 +4169,6 @@ L$epilogue_avx: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-linux.S index 749b6bea..1023d3c4 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha256-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha256-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -4170,7 +4169,6 @@ _CET_ENDBR .cfi_endproc .size sha256_block_data_order_avx,.-sha256_block_data_order_avx #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-apple.S new file mode 100644 index 00000000..b7e6b86b --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-apple.S @@ -0,0 +1,2411 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw +.align 4 +_sha512_block_data_order_nohw: +L_sha512_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L000pic_point +L000pic_point: + popl %ebp + leal LK512-L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + subl $80,%esp + jmp L001loop_sse2 +.align 4,0x90 +L001loop_sse2: + movq %mm1,8(%esp) + movq %mm2,16(%esp) + movq %mm3,24(%esp) + movq %mm5,40(%esp) + movq %mm6,48(%esp) + pxor %mm1,%mm2 + movq %mm7,56(%esp) + movq %mm0,%mm3 + movl (%edi),%eax + movl 4(%edi),%ebx + addl $8,%edi + movl $15,%edx + bswap %eax + bswap %ebx + jmp L00200_14_sse2 +.align 4,0x90 +L00200_14_sse2: + movd %eax,%mm1 + movl (%edi),%eax + movd %ebx,%mm7 + movl 4(%edi),%ebx + addl $8,%edi + bswap %eax + bswap %ebx + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + movq 48(%esp),%mm6 + decl %edx + jnz L00200_14_sse2 + movd %eax,%mm1 + movd %ebx,%mm7 + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor 
%mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + pxor %mm0,%mm0 + movl $32,%edx + jmp L00316_79_sse2 +.align 4,0x90 +L00316_79_sse2: + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm0 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm2 + addl $8,%ebp + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm2 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm0 + addl $8,%ebp + decl %edx + jnz L00316_79_sse2 + paddq %mm3,%mm0 + movq 8(%esp),%mm1 + movq 24(%esp),%mm3 + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq (%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movl $640,%eax + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq 
%mm7,56(%esi) + leal (%esp,%eax,1),%esp + subl %eax,%ebp + cmpl 88(%esp),%edi + jb L001loop_sse2 + movl 92(%esp),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha512_block_data_order_ssse3 +.private_extern _sha512_block_data_order_ssse3 +.align 4 +_sha512_block_data_order_ssse3: +L_sha512_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L004pic_point +L004pic_point: + popl %ebp + leal LK512-L004pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + leal -64(%esp),%edx + subl $256,%esp + movdqa 640(%ebp),%xmm1 + movdqu (%edi),%xmm0 +.byte 102,15,56,0,193 + movdqa (%ebp),%xmm3 + movdqa %xmm1,%xmm2 + movdqu 16(%edi),%xmm1 + paddq %xmm0,%xmm3 +.byte 102,15,56,0,202 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%edi),%xmm2 + paddq %xmm1,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 48(%edi),%xmm3 + paddq %xmm2,%xmm5 +.byte 102,15,56,0,220 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%edi),%xmm4 + paddq %xmm3,%xmm6 +.byte 102,15,56,0,229 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%edi),%xmm5 + paddq %xmm4,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%edi),%xmm6 + paddq %xmm5,%xmm0 +.byte 102,15,56,0,247 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 112(%edi),%xmm7 + paddq %xmm6,%xmm1 +.byte 102,15,56,0,248 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movdqa %xmm2,-16(%edx) + nop +.align 5,0x90 +L005loop_ssse3: + movdqa 16(%edx),%xmm2 + movdqa %xmm3,48(%edx) + leal 128(%ebp),%ebp + movq %mm1,8(%esp) + movl %edi,%ebx + movq %mm2,16(%esp) + leal 128(%edi),%edi + movq %mm3,24(%esp) + cmpl %eax,%edi + movq %mm5,40(%esp) + cmovbl %edi,%ebx + movq %mm6,48(%esp) + movl $4,%ecx + pxor %mm1,%mm2 + movq %mm7,56(%esp) + pxor %mm3,%mm3 + jmp L00600_47_ssse3 +.align 5,0x90 +L00600_47_ssse3: + movdqa %xmm5,%xmm3 + movdqa %xmm2,%xmm1 +.byte 102,15,58,15,208,8 + movdqa %xmm4,(%edx) +.byte 102,15,58,15,220,8 + movdqa %xmm2,%xmm4 + psrlq $7,%xmm2 + paddq %xmm3,%xmm0 + movdqa %xmm4,%xmm3 + psrlq $1,%xmm4 + psllq $56,%xmm3 + pxor %xmm4,%xmm2 + psrlq $7,%xmm4 + pxor %xmm3,%xmm2 + psllq $7,%xmm3 + pxor %xmm4,%xmm2 + movdqa %xmm7,%xmm4 + pxor %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + psrlq $6,%xmm4 + paddq %xmm2,%xmm0 + movdqa %xmm7,%xmm2 + psrlq $19,%xmm3 + psllq $3,%xmm2 + pxor %xmm3,%xmm4 + psrlq $42,%xmm3 + pxor %xmm2,%xmm4 + psllq $42,%xmm2 + pxor %xmm3,%xmm4 + movdqa 32(%edx),%xmm3 + pxor %xmm2,%xmm4 + movdqa (%ebp),%xmm2 + movq %mm4,%mm1 + paddq %xmm4,%xmm0 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm0,%xmm2 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + 
psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm2,-128(%edx) + movdqa %xmm6,%xmm4 + movdqa %xmm3,%xmm2 +.byte 102,15,58,15,217,8 + movdqa %xmm5,16(%edx) +.byte 102,15,58,15,229,8 + movdqa %xmm3,%xmm5 + psrlq $7,%xmm3 + paddq %xmm4,%xmm1 + movdqa %xmm5,%xmm4 + psrlq $1,%xmm5 + psllq $56,%xmm4 + pxor %xmm5,%xmm3 + psrlq $7,%xmm5 + pxor %xmm4,%xmm3 + psllq $7,%xmm4 + pxor %xmm5,%xmm3 + movdqa %xmm0,%xmm5 + pxor %xmm4,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $6,%xmm5 + paddq %xmm3,%xmm1 + movdqa %xmm0,%xmm3 + psrlq $19,%xmm4 + psllq $3,%xmm3 + pxor %xmm4,%xmm5 + psrlq $42,%xmm4 + pxor %xmm3,%xmm5 + psllq $42,%xmm3 + pxor %xmm4,%xmm5 + movdqa 48(%edx),%xmm4 + pxor %xmm3,%xmm5 + movdqa 16(%ebp),%xmm3 + movq %mm4,%mm1 + paddq %xmm5,%xmm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm1,%xmm3 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 
8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm3,-112(%edx) + movdqa %xmm7,%xmm5 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,226,8 + movdqa %xmm6,32(%edx) +.byte 102,15,58,15,238,8 + movdqa %xmm4,%xmm6 + psrlq $7,%xmm4 + paddq %xmm5,%xmm2 + movdqa %xmm6,%xmm5 + psrlq $1,%xmm6 + psllq $56,%xmm5 + pxor %xmm6,%xmm4 + psrlq $7,%xmm6 + pxor %xmm5,%xmm4 + psllq $7,%xmm5 + pxor %xmm6,%xmm4 + movdqa %xmm1,%xmm6 + pxor %xmm5,%xmm4 + movdqa %xmm1,%xmm5 + psrlq $6,%xmm6 + paddq %xmm4,%xmm2 + movdqa %xmm1,%xmm4 + psrlq $19,%xmm5 + psllq $3,%xmm4 + pxor %xmm5,%xmm6 + psrlq $42,%xmm5 + pxor %xmm4,%xmm6 + psllq $42,%xmm4 + pxor %xmm5,%xmm6 + movdqa (%edx),%xmm5 + pxor %xmm4,%xmm6 + movdqa 32(%ebp),%xmm4 + movq %mm4,%mm1 + paddq %xmm6,%xmm2 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm2,%xmm4 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm4,-96(%edx) + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm4 +.byte 102,15,58,15,235,8 + movdqa %xmm7,48(%edx) +.byte 102,15,58,15,247,8 + movdqa %xmm5,%xmm7 + psrlq $7,%xmm5 + paddq %xmm6,%xmm3 + movdqa %xmm7,%xmm6 + psrlq $1,%xmm7 + psllq $56,%xmm6 + pxor %xmm7,%xmm5 + psrlq $7,%xmm7 + pxor %xmm6,%xmm5 + psllq $7,%xmm6 + pxor %xmm7,%xmm5 + movdqa %xmm2,%xmm7 + pxor %xmm6,%xmm5 + movdqa %xmm2,%xmm6 + psrlq $6,%xmm7 + paddq %xmm5,%xmm3 + movdqa %xmm2,%xmm5 + psrlq $19,%xmm6 + psllq $3,%xmm5 + pxor %xmm6,%xmm7 + psrlq $42,%xmm6 + pxor %xmm5,%xmm7 + psllq $42,%xmm5 + pxor %xmm6,%xmm7 + movdqa 16(%edx),%xmm6 + pxor %xmm5,%xmm7 + movdqa 48(%ebp),%xmm5 + movq %mm4,%mm1 + paddq %xmm7,%xmm3 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm3,%xmm5 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq 
%mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm5,-80(%edx) + movdqa %xmm1,%xmm7 + movdqa %xmm6,%xmm5 +.byte 102,15,58,15,244,8 + movdqa %xmm0,(%edx) +.byte 102,15,58,15,248,8 + movdqa %xmm6,%xmm0 + psrlq $7,%xmm6 + paddq %xmm7,%xmm4 + movdqa %xmm0,%xmm7 + psrlq $1,%xmm0 + psllq $56,%xmm7 + pxor %xmm0,%xmm6 + psrlq $7,%xmm0 + pxor %xmm7,%xmm6 + psllq $7,%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm3,%xmm0 + pxor %xmm7,%xmm6 + movdqa %xmm3,%xmm7 + psrlq $6,%xmm0 + paddq %xmm6,%xmm4 + movdqa %xmm3,%xmm6 + psrlq $19,%xmm7 + psllq $3,%xmm6 + pxor %xmm7,%xmm0 + psrlq $42,%xmm7 + pxor %xmm6,%xmm0 + psllq $42,%xmm6 + pxor %xmm7,%xmm0 + movdqa 32(%edx),%xmm7 + pxor %xmm6,%xmm0 + movdqa 64(%ebp),%xmm6 + movq %mm4,%mm1 + paddq %xmm0,%xmm4 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm4,%xmm6 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm6,-64(%edx) + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm6 +.byte 
102,15,58,15,253,8 + movdqa %xmm1,16(%edx) +.byte 102,15,58,15,193,8 + movdqa %xmm7,%xmm1 + psrlq $7,%xmm7 + paddq %xmm0,%xmm5 + movdqa %xmm1,%xmm0 + psrlq $1,%xmm1 + psllq $56,%xmm0 + pxor %xmm1,%xmm7 + psrlq $7,%xmm1 + pxor %xmm0,%xmm7 + psllq $7,%xmm0 + pxor %xmm1,%xmm7 + movdqa %xmm4,%xmm1 + pxor %xmm0,%xmm7 + movdqa %xmm4,%xmm0 + psrlq $6,%xmm1 + paddq %xmm7,%xmm5 + movdqa %xmm4,%xmm7 + psrlq $19,%xmm0 + psllq $3,%xmm7 + pxor %xmm0,%xmm1 + psrlq $42,%xmm0 + pxor %xmm7,%xmm1 + psllq $42,%xmm7 + pxor %xmm0,%xmm1 + movdqa 48(%edx),%xmm0 + pxor %xmm7,%xmm1 + movdqa 80(%ebp),%xmm7 + movq %mm4,%mm1 + paddq %xmm1,%xmm5 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm5,%xmm7 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm7,-48(%edx) + movdqa %xmm3,%xmm1 + movdqa %xmm0,%xmm7 +.byte 102,15,58,15,198,8 + movdqa %xmm2,32(%edx) +.byte 102,15,58,15,202,8 + movdqa %xmm0,%xmm2 + psrlq $7,%xmm0 + paddq %xmm1,%xmm6 + movdqa %xmm2,%xmm1 + psrlq $1,%xmm2 + psllq $56,%xmm1 + pxor %xmm2,%xmm0 + psrlq $7,%xmm2 + pxor %xmm1,%xmm0 + psllq $7,%xmm1 + pxor %xmm2,%xmm0 + movdqa %xmm5,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm1 + psrlq $6,%xmm2 + paddq %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + psrlq $19,%xmm1 + psllq $3,%xmm0 + pxor %xmm1,%xmm2 + psrlq $42,%xmm1 + pxor %xmm0,%xmm2 + psllq $42,%xmm0 + pxor %xmm1,%xmm2 + movdqa (%edx),%xmm1 + pxor %xmm0,%xmm2 + movdqa 96(%ebp),%xmm0 + movq %mm4,%mm1 + paddq %xmm2,%xmm6 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm6,%xmm0 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 
+ pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm0,-32(%edx) + movdqa %xmm4,%xmm2 + movdqa %xmm1,%xmm0 +.byte 102,15,58,15,207,8 + movdqa %xmm3,48(%edx) +.byte 102,15,58,15,211,8 + movdqa %xmm1,%xmm3 + psrlq $7,%xmm1 + paddq %xmm2,%xmm7 + movdqa %xmm3,%xmm2 + psrlq $1,%xmm3 + psllq $56,%xmm2 + pxor %xmm3,%xmm1 + psrlq $7,%xmm3 + pxor %xmm2,%xmm1 + psllq $7,%xmm2 + pxor %xmm3,%xmm1 + movdqa %xmm6,%xmm3 + pxor %xmm2,%xmm1 + movdqa %xmm6,%xmm2 + psrlq $6,%xmm3 + paddq %xmm1,%xmm7 + movdqa %xmm6,%xmm1 + psrlq $19,%xmm2 + psllq $3,%xmm1 + pxor %xmm2,%xmm3 + psrlq $42,%xmm2 + pxor %xmm1,%xmm3 + psllq $42,%xmm1 + pxor %xmm2,%xmm3 + movdqa 16(%edx),%xmm2 + pxor %xmm1,%xmm3 + movdqa 112(%ebp),%xmm1 + movq %mm4,%mm1 + paddq %xmm3,%xmm7 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm7,%xmm1 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm1,-16(%edx) + leal 128(%ebp),%ebp + decl %ecx + jnz L00600_47_ssse3 + movdqa (%ebp),%xmm1 + leal -640(%ebp),%ebp + movdqu (%ebx),%xmm0 +.byte 102,15,56,0,193 + movdqa (%ebp),%xmm3 + movdqa 
%xmm1,%xmm2 + movdqu 16(%ebx),%xmm1 + paddq %xmm0,%xmm3 +.byte 102,15,56,0,202 + movq %mm4,%mm1 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%ebx),%xmm2 + paddq %xmm1,%xmm4 +.byte 102,15,56,0,211 + movq %mm4,%mm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 
48(%ebx),%xmm3 + paddq %xmm2,%xmm5 +.byte 102,15,56,0,220 + movq %mm4,%mm1 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%ebx),%xmm4 + paddq %xmm3,%xmm6 +.byte 102,15,56,0,229 + movq %mm4,%mm1 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%ebx),%xmm5 + paddq 
%xmm4,%xmm7 +.byte 102,15,56,0,238 + movq %mm4,%mm1 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%ebx),%xmm6 + paddq %xmm5,%xmm0 +.byte 102,15,56,0,247 + movq %mm4,%mm1 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 
112(%ebx),%xmm7 + paddq %xmm6,%xmm1 +.byte 102,15,56,0,248 + movq %mm4,%mm1 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movq %mm4,%mm1 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm2,-16(%edx) + movq 8(%esp),%mm1 + paddq %mm3,%mm0 + movq 24(%esp),%mm3 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq 
(%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + cmpl %eax,%edi + jb L005loop_ssse3 + movl 76(%edx),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK512: +.long 3609767458,1116352408 +.long 602891725,1899447441 +.long 3964484399,3049323471 +.long 2173295548,3921009573 +.long 4081628472,961987163 +.long 3053834265,1508970993 +.long 2937671579,2453635748 +.long 3664609560,2870763221 +.long 2734883394,3624381080 +.long 1164996542,310598401 +.long 1323610764,607225278 +.long 3590304994,1426881987 +.long 4068182383,1925078388 +.long 991336113,2162078206 +.long 633803317,2614888103 +.long 3479774868,3248222580 +.long 2666613458,3835390401 +.long 944711139,4022224774 +.long 2341262773,264347078 +.long 2007800933,604807628 +.long 1495990901,770255983 +.long 1856431235,1249150122 +.long 3175218132,1555081692 +.long 2198950837,1996064986 +.long 3999719339,2554220882 +.long 766784016,2821834349 +.long 2566594879,2952996808 +.long 3203337956,3210313671 +.long 1034457026,3336571891 +.long 2466948901,3584528711 +.long 3758326383,113926993 +.long 168717936,338241895 +.long 1188179964,666307205 +.long 1546045734,773529912 +.long 1522805485,1294757372 +.long 2643833823,1396182291 +.long 2343527390,1695183700 +.long 1014477480,1986661051 +.long 1206759142,2177026350 +.long 344077627,2456956037 +.long 1290863460,2730485921 +.long 3158454273,2820302411 +.long 3505952657,3259730800 +.long 106217008,3345764771 +.long 3606008344,3516065817 +.long 1432725776,3600352804 +.long 1467031594,4094571909 +.long 851169720,275423344 +.long 3100823752,430227734 +.long 1363258195,506948616 +.long 3750685593,659060556 +.long 3785050280,883997877 +.long 3318307427,958139571 +.long 3812723403,1322822218 +.long 2003034995,1537002063 +.long 3602036899,1747873779 +.long 1575990012,1955562222 +.long 1125592928,2024104815 +.long 2716904306,2227730452 +.long 442776044,2361852424 +.long 593698344,2428436474 +.long 3733110249,2756734187 +.long 2999351573,3204031479 +.long 3815920427,3329325298 +.long 3928383900,3391569614 +.long 566280711,3515267271 +.long 3454069534,3940187606 +.long 4000239992,4118630271 +.long 1914138554,116418474 +.long 2731055270,174292421 +.long 3203993006,289380356 +.long 320620315,460393269 +.long 587496836,685471733 +.long 1086792851,852142971 +.long 365543100,1017036298 +.long 2618297676,1126000580 +.long 3409855158,1288033470 +.long 4234509866,1501505948 +.long 987167468,1607167915 +.long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-linux.S similarity index 83% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-586-linux.S index 
aa1ada92..cdd4b413 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -7,12 +6,12 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) .text -.globl sha512_block_data_order -.hidden sha512_block_data_order -.type sha512_block_data_order,@function +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,@function .align 16 -sha512_block_data_order: -.L_sha512_block_data_order_begin: +sha512_block_data_order_nohw: +.L_sha512_block_data_order_nohw_begin: pushl %ebp pushl %ebx pushl %esi @@ -24,7 +23,7 @@ sha512_block_data_order: call .L000pic_point .L000pic_point: popl %ebp - leal .L001K512-.L000pic_point(%ebp),%ebp + leal .LK512-.L000pic_point(%ebp),%ebp subl $16,%esp andl $-64,%esp shll $7,%eax @@ -33,28 +32,18 @@ sha512_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) - leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx - movl (%edx),%ecx - testl $67108864,%ecx - jz .L002loop_x86 - movl 4(%edx),%edx movq (%esi),%mm0 - andl $16777216,%ecx movq 8(%esi),%mm1 - andl $512,%edx movq 16(%esi),%mm2 - orl %edx,%ecx movq 24(%esi),%mm3 movq 32(%esi),%mm4 movq 40(%esi),%mm5 movq 48(%esi),%mm6 movq 56(%esi),%mm7 - cmpl $16777728,%ecx - je .L003SSSE3 subl $80,%esp - jmp .L004loop_sse2 + jmp .L001loop_sse2 .align 16 -.L004loop_sse2: +.L001loop_sse2: movq %mm1,8(%esp) movq %mm2,16(%esp) movq %mm3,24(%esp) @@ -69,9 +58,9 @@ sha512_block_data_order: movl $15,%edx bswap %eax bswap %ebx - jmp .L00500_14_sse2 + jmp .L00200_14_sse2 .align 16 -.L00500_14_sse2: +.L00200_14_sse2: movd %eax,%mm1 movl (%edi),%eax movd %ebx,%mm7 @@ -132,7 +121,7 @@ sha512_block_data_order: paddq %mm6,%mm3 movq 48(%esp),%mm6 decl %edx - jnz .L00500_14_sse2 + jnz .L00200_14_sse2 movd %eax,%mm1 movd %ebx,%mm7 punpckldq %mm1,%mm7 @@ -188,9 +177,9 @@ sha512_block_data_order: paddq %mm6,%mm3 pxor %mm0,%mm0 movl $32,%edx - jmp .L00616_79_sse2 + jmp .L00316_79_sse2 .align 16 -.L00616_79_sse2: +.L00316_79_sse2: movq 88(%esp),%mm5 movq %mm7,%mm1 psrlq $1,%mm7 @@ -344,7 +333,7 @@ sha512_block_data_order: paddq %mm6,%mm0 addl $8,%ebp decl %edx - jnz .L00616_79_sse2 + jnz .L00316_79_sse2 paddq %mm3,%mm0 movq 8(%esp),%mm1 movq 24(%esp),%mm3 @@ -372,7 +361,7 @@ sha512_block_data_order: leal (%esp,%eax,1),%esp subl %eax,%ebp cmpl 88(%esp),%edi - jb .L004loop_sse2 + jb .L001loop_sse2 movl 92(%esp),%esp emms popl %edi @@ -380,8 +369,41 @@ sha512_block_data_order: popl %ebx popl %ebp ret -.align 32 -.L003SSSE3: +.size sha512_block_data_order_nohw,.-.L_sha512_block_data_order_nohw_begin +.globl sha512_block_data_order_ssse3 +.hidden sha512_block_data_order_ssse3 +.type sha512_block_data_order_ssse3,@function +.align 16 +sha512_block_data_order_ssse3: +.L_sha512_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call .L004pic_point +.L004pic_point: + popl %ebp + leal .LK512-.L004pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + 
movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 leal -64(%esp),%edx subl $256,%esp movdqa 640(%ebp),%xmm1 @@ -438,7 +460,7 @@ sha512_block_data_order: movdqa %xmm2,-16(%edx) nop .align 32 -.L007loop_ssse3: +.L005loop_ssse3: movdqa 16(%edx),%xmm2 movdqa %xmm3,48(%edx) leal 128(%ebp),%ebp @@ -455,9 +477,9 @@ sha512_block_data_order: pxor %mm1,%mm2 movq %mm7,56(%esp) pxor %mm3,%mm3 - jmp .L00800_47_ssse3 + jmp .L00600_47_ssse3 .align 32 -.L00800_47_ssse3: +.L00600_47_ssse3: movdqa %xmm5,%xmm3 movdqa %xmm2,%xmm1 .byte 102,15,58,15,208,8 @@ -1476,7 +1498,7 @@ sha512_block_data_order: movdqa %xmm1,-16(%edx) leal 128(%ebp),%ebp decl %ecx - jnz .L00800_47_ssse3 + jnz .L00600_47_ssse3 movdqa (%ebp),%xmm1 leal -640(%ebp),%ebp movdqu (%ebx),%xmm0 @@ -2288,7 +2310,7 @@ sha512_block_data_order: movq %mm6,48(%esi) movq %mm7,56(%esi) cmpl %eax,%edi - jb .L007loop_ssse3 + jb .L005loop_ssse3 movl 76(%edx),%esp emms popl %edi @@ -2296,456 +2318,9 @@ sha512_block_data_order: popl %ebx popl %ebp ret -.align 16 -.L002loop_x86: - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 16(%edi),%eax - movl 20(%edi),%ebx - movl 24(%edi),%ecx - movl 28(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 32(%edi),%eax - movl 36(%edi),%ebx - movl 40(%edi),%ecx - movl 44(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 48(%edi),%eax - movl 52(%edi),%ebx - movl 56(%edi),%ecx - movl 60(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 64(%edi),%eax - movl 68(%edi),%ebx - movl 72(%edi),%ecx - movl 76(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 80(%edi),%eax - movl 84(%edi),%ebx - movl 88(%edi),%ecx - movl 92(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 96(%edi),%eax - movl 100(%edi),%ebx - movl 104(%edi),%ecx - movl 108(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 112(%edi),%eax - movl 116(%edi),%ebx - movl 120(%edi),%ecx - movl 124(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - addl $128,%edi - subl $72,%esp - movl %edi,204(%esp) - leal 8(%esp),%edi - movl $16,%ecx -.long 2784229001 -.align 16 -.L00900_15_x86: - movl 40(%esp),%ecx - movl 44(%esp),%edx - movl %ecx,%esi - shrl $9,%ecx - movl %edx,%edi - shrl $9,%edx - movl %ecx,%ebx - shll $14,%esi - movl %edx,%eax - shll $14,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%eax - shll $4,%esi - xorl %edx,%ebx - shll $4,%edi - xorl %esi,%ebx - shrl $4,%ecx - xorl %edi,%eax - shrl $4,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 48(%esp),%ecx - movl 52(%esp),%edx - movl 56(%esp),%esi - movl 60(%esp),%edi - addl 64(%esp),%eax - adcl 68(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - andl 40(%esp),%ecx - andl 44(%esp),%edx - addl 192(%esp),%eax - adcl 196(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - movl (%ebp),%esi - movl 4(%ebp),%edi - addl %ecx,%eax - adcl %edx,%ebx - movl 32(%esp),%ecx - movl 36(%esp),%edx - addl %esi,%eax - 
adcl %edi,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - addl %ecx,%eax - adcl %edx,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,%esi - shrl $2,%ecx - movl %edx,%edi - shrl $2,%edx - movl %ecx,%ebx - shll $4,%esi - movl %edx,%eax - shll $4,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%ebx - shll $21,%esi - xorl %edx,%eax - shll $21,%edi - xorl %esi,%eax - shrl $21,%ecx - xorl %edi,%ebx - shrl $21,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl 16(%esp),%esi - movl 20(%esp),%edi - addl (%esp),%eax - adcl 4(%esp),%ebx - orl %esi,%ecx - orl %edi,%edx - andl 24(%esp),%ecx - andl 28(%esp),%edx - andl 8(%esp),%esi - andl 12(%esp),%edi - orl %esi,%ecx - orl %edi,%edx - addl %ecx,%eax - adcl %edx,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - movb (%ebp),%dl - subl $8,%esp - leal 8(%ebp),%ebp - cmpb $148,%dl - jne .L00900_15_x86 -.align 16 -.L01016_79_x86: - movl 312(%esp),%ecx - movl 316(%esp),%edx - movl %ecx,%esi - shrl $1,%ecx - movl %edx,%edi - shrl $1,%edx - movl %ecx,%eax - shll $24,%esi - movl %edx,%ebx - shll $24,%edi - xorl %esi,%ebx - shrl $6,%ecx - xorl %edi,%eax - shrl $6,%edx - xorl %ecx,%eax - shll $7,%esi - xorl %edx,%ebx - shll $1,%edi - xorl %esi,%ebx - shrl $1,%ecx - xorl %edi,%eax - shrl $1,%edx - xorl %ecx,%eax - shll $6,%edi - xorl %edx,%ebx - xorl %edi,%eax - movl %eax,(%esp) - movl %ebx,4(%esp) - movl 208(%esp),%ecx - movl 212(%esp),%edx - movl %ecx,%esi - shrl $6,%ecx - movl %edx,%edi - shrl $6,%edx - movl %ecx,%eax - shll $3,%esi - movl %edx,%ebx - shll $3,%edi - xorl %esi,%eax - shrl $13,%ecx - xorl %edi,%ebx - shrl $13,%edx - xorl %ecx,%eax - shll $10,%esi - xorl %edx,%ebx - shll $10,%edi - xorl %esi,%ebx - shrl $10,%ecx - xorl %edi,%eax - shrl $10,%edx - xorl %ecx,%ebx - shll $13,%edi - xorl %edx,%eax - xorl %edi,%eax - movl 320(%esp),%ecx - movl 324(%esp),%edx - addl (%esp),%eax - adcl 4(%esp),%ebx - movl 248(%esp),%esi - movl 252(%esp),%edi - addl %ecx,%eax - adcl %edx,%ebx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,192(%esp) - movl %ebx,196(%esp) - movl 40(%esp),%ecx - movl 44(%esp),%edx - movl %ecx,%esi - shrl $9,%ecx - movl %edx,%edi - shrl $9,%edx - movl %ecx,%ebx - shll $14,%esi - movl %edx,%eax - shll $14,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%eax - shll $4,%esi - xorl %edx,%ebx - shll $4,%edi - xorl %esi,%ebx - shrl $4,%ecx - xorl %edi,%eax - shrl $4,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 48(%esp),%ecx - movl 52(%esp),%edx - movl 56(%esp),%esi - movl 60(%esp),%edi - addl 64(%esp),%eax - adcl 68(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - andl 40(%esp),%ecx - andl 44(%esp),%edx - addl 192(%esp),%eax - adcl 196(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - movl (%ebp),%esi - movl 4(%ebp),%edi - addl %ecx,%eax - adcl %edx,%ebx - movl 32(%esp),%ecx - movl 36(%esp),%edx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - addl %ecx,%eax - adcl %edx,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,%esi - shrl $2,%ecx - movl %edx,%edi - shrl $2,%edx - movl %ecx,%ebx - shll $4,%esi - movl %edx,%eax - shll $4,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%ebx - shll $21,%esi - xorl %edx,%eax - shll $21,%edi - xorl %esi,%eax - shrl $21,%ecx - xorl %edi,%ebx 
- shrl $21,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl 16(%esp),%esi - movl 20(%esp),%edi - addl (%esp),%eax - adcl 4(%esp),%ebx - orl %esi,%ecx - orl %edi,%edx - andl 24(%esp),%ecx - andl 28(%esp),%edx - andl 8(%esp),%esi - andl 12(%esp),%edi - orl %esi,%ecx - orl %edi,%edx - addl %ecx,%eax - adcl %edx,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - movb (%ebp),%dl - subl $8,%esp - leal 8(%ebp),%ebp - cmpb $23,%dl - jne .L01016_79_x86 - movl 840(%esp),%esi - movl 844(%esp),%edi - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - addl 8(%esp),%eax - adcl 12(%esp),%ebx - movl %eax,(%esi) - movl %ebx,4(%esi) - addl 16(%esp),%ecx - adcl 20(%esp),%edx - movl %ecx,8(%esi) - movl %edx,12(%esi) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - addl 24(%esp),%eax - adcl 28(%esp),%ebx - movl %eax,16(%esi) - movl %ebx,20(%esi) - addl 32(%esp),%ecx - adcl 36(%esp),%edx - movl %ecx,24(%esi) - movl %edx,28(%esi) - movl 32(%esi),%eax - movl 36(%esi),%ebx - movl 40(%esi),%ecx - movl 44(%esi),%edx - addl 40(%esp),%eax - adcl 44(%esp),%ebx - movl %eax,32(%esi) - movl %ebx,36(%esi) - addl 48(%esp),%ecx - adcl 52(%esp),%edx - movl %ecx,40(%esi) - movl %edx,44(%esi) - movl 48(%esi),%eax - movl 52(%esi),%ebx - movl 56(%esi),%ecx - movl 60(%esi),%edx - addl 56(%esp),%eax - adcl 60(%esp),%ebx - movl %eax,48(%esi) - movl %ebx,52(%esi) - addl 64(%esp),%ecx - adcl 68(%esp),%edx - movl %ecx,56(%esi) - movl %edx,60(%esi) - addl $840,%esp - subl $640,%ebp - cmpl 8(%esp),%edi - jb .L002loop_x86 - movl 12(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret +.size sha512_block_data_order_ssse3,.-.L_sha512_block_data_order_ssse3_begin .align 64 -.L001K512: +.LK512: .long 3609767458,1116352408 .long 602891725,1899447441 .long 3964484399,3049323471 @@ -2828,14 +2403,12 @@ sha512_block_data_order: .long 1246189591,1816402316 .long 67438087,66051 .long 202182159,134810123 -.size sha512_block_data_order,.-.L_sha512_block_data_order_begin .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv4-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-armv4-linux.S index aa19360c..a65dd577 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv4-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv4-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1855,7 +1854,6 @@ sha512_block_data_order_neon:
 .align 2
 .align 2
 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
-#endif // defined(__arm__) && defined(__linux__)
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-apple.S
similarity index 99%
rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-ios.ios.aarch64.S
rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-apple.S
index a5b629fc..c4322431 100644
--- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-ios.ios.aarch64.S
+++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-apple.S
@@ -1,5 +1,4 @@
 #define BORINGSSL_PREFIX CCryptoBoringSSL
-#if defined(__aarch64__) && defined(__APPLE__)
 // This file is generated from a similarly-named Perl script in the BoringSSL
 // source tree. Do not edit by hand.

@@ -1596,7 +1595,6 @@ Loop_hw:

 #endif
 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#endif // defined(__aarch64__) && defined(__APPLE__)
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-linux.S
similarity index 99%
rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-linux.linux.aarch64.S
rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-linux.S
index 0a57a1b1..b5a9da51 100644
--- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-armv8-linux.linux.aarch64.S
+++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-linux.S
@@ -1,5 +1,4 @@
 #define BORINGSSL_PREFIX CCryptoBoringSSL
-#if defined(__aarch64__) && defined(__linux__)
 // This file is generated from a similarly-named Perl script in the BoringSSL
 // source tree. Do not edit by hand.

@@ -1596,7 +1595,6 @@ sha512_block_data_order_hw:
 .size sha512_block_data_order_hw,.-sha512_block_data_order_hw
 #endif
 #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
-#endif // defined(__aarch64__) && defined(__linux__)
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
diff --git a/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-win.S
new file mode 100644
index 00000000..5de6ea53
--- /dev/null
+++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-armv8-win.S
@@ -0,0 +1,1605 @@
+#define BORINGSSL_PREFIX CCryptoBoringSSL
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +# include +#endif + +.text + +.globl sha512_block_data_order_nohw + +.def sha512_block_data_order_nohw + .type 32 +.endef +.align 6 +sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512 + add x30,x30,:lo12:LK512 + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 
+ add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and 
x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // 
(b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and 
x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + 
orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // 
h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor 
x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b 
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw + +.def sha512_block_data_order_hw + .type 32 +.endef +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,LK512 + add x3,x3,:lo12:LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b Loop_hw + +.align 4 +Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 
{v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add 
v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 
0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // 
"T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add 
v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-apple.S index a5b8b3dc..7bfe0bb6 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
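
Editor's note on the scalar path added above for the new Windows/AArch64 target: the sha512_block_data_order_nohw body annotates every round with Sigma1(e), Ch(e,f,g), Maj(a,b,c), Sigma0(a) and, in the schedule rounds, sigma0/sigma1. The sketch below is a minimal C rendering of the FIPS 180-4 round those comments name, for readers following the register-level comments; it is an editorial illustration, not part of this patch, and the helper names are invented. The rotation amounts correspond to the ror#14/#18/#41 and ror#28/#34/#39 sequences in the assembly.

```c
#include <stdint.h>

/* Editorial sketch of one SHA-512 round (FIPS 180-4); not part of this patch. */
static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

static inline uint64_t Sigma0(uint64_t x) { return rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39); }
static inline uint64_t Sigma1(uint64_t x) { return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41); }
static inline uint64_t sigma0(uint64_t x) { return rotr64(x, 1)  ^ rotr64(x, 8)  ^ (x >> 7); }  /* schedule: W[i-15] */
static inline uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }  /* schedule: W[i-2]  */
static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return (e & f) ^ (~e & g); }
static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return (a & b) ^ (a & c) ^ (b & c); }

/* One round over state s[0..7] = a..h, round constant k, message word w:
 *   T1 = h + Sigma1(e) + Ch(e,f,g) + k + w   ("h+=K[i]", "h+=X[i]", "h+=Ch", "h+=Sigma1(e)")
 *   T2 = Sigma0(a) + Maj(a,b,c)
 *   d += T1 ("d+=h"); new a = T1 + T2 ("h+=Maj", "h+=Sigma0(a)")
 */
static void sha512_round(uint64_t s[8], uint64_t k, uint64_t w) {
    uint64_t t1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + k + w;
    uint64_t t2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
    s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
    s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
}
```

In the generated file the same computation is interleaved across x19–x28 and unrolled sixteen rounds at a time, while the sha512_block_data_order_hw variant replaces it with the Armv8.2 SHA-512 crypto-extension instructions emitted above as raw .long opcodes (sha512h, sha512h2, sha512su0, sha512su1).
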
@@ -2978,7 +2977,6 @@ L$epilogue_avx: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-linux.S index c6f54669..2f8e9dce 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/sha512-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/sha512-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -2978,7 +2977,6 @@ _CET_ENDBR .cfi_endproc .size sha512_block_data_order_avx,.-sha512_block_data_order_avx #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv7-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv7-linux.S index 42ae7354..d87e8703 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv7-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv7-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1225,7 +1224,6 @@ vpaes_ctr32_encrypt_blocks: ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-apple.S index 8172671d..bf67728f 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1224,7 +1223,6 @@ Lctr32_done: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-linux.S index 6a7e2c27..c690fffc 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1224,7 +1223,6 @@ vpaes_ctr32_encrypt_blocks: ret .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-win.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-win.S new file mode 100644 index 00000000..d0ae73aa --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-armv8-win.S @@ -0,0 +1,1267 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.section .rodata + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 
0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.def _vpaes_encrypt_preheat + .type 32 +.endef +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.def _vpaes_encrypt_core + .type 32 +.endef +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + +.globl vpaes_encrypt + +.def vpaes_encrypt + .type 32 +.endef +.align 4 +vpaes_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.def _vpaes_encrypt_2x + .type 32 +.endef +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + + +.def _vpaes_decrypt_preheat + .type 32 +.endef +.align 4 +_vpaes_decrypt_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v17.16b, #0x0f + adrp x11, Lk_dipt + add x11, x11, :lo12:Lk_dipt + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe + ret + + +## +## Decryption core +## +## Same API as encryption core. 
+## +.def _vpaes_decrypt_core + .type 32 +.endef +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr + add x10, x10, :lo12:Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward+48 + add x10, x10, :lo12:Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Ldec_entry + +.align 4 +Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, 
%xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret + + +.globl vpaes_decrypt + +.def vpaes_decrypt + .type 32 +.endef +.align 4 +vpaes_decrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// v14-v15 input, v0-v1 output +.def _vpaes_decrypt_2x + .type 32 +.endef +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr + add x10, x10, :lo12:Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward+48 + add x10, x10, :lo12:Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Ldec_2x_entry + +.align 4 +Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + 
tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.def _vpaes_key_preheat + .type 32 +.endef +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v16.16b, #0x5b // 
Lk_s63 + adrp x11, Lk_sb1 + add x11, x11, :lo12:Lk_sb1 + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + adrp x10, Lk_dksd + add x10, x10, :lo12:Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward + add x11, x11, :lo12:Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + +.def _vpaes_schedule_core + .type 32 +.endef +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr // lea Lk_sr(%rip),%r10 + add x10, x10, :lo12:Lk_sr + + add x8, x8, x10 + cbnz w3, Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b Lschedule_go + +Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. 
The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:Lk_deskew + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. 
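+## (In the register sketches below, "+" denotes XOR, as in the other
+## key-schedule comments.)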
+## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.def _vpaes_schedule_192_smear + .type 32 +.endef +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.def _vpaes_schedule_round + .type 32 +.endef +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. 
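+	// (_vpaes_schedule_round falls through to this label; the 256-bit key
+	// schedule in Loop_schedule_256 also branches here directly via
+	// bl _vpaes_schedule_low_round.)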
+_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.def _vpaes_schedule_transform + .type 32 +.endef +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.def _vpaes_schedule_mangle + .type 32 +.endef +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b Lschedule_mangle_both +.align 4 +Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl vpaes_set_encrypt_key + +.def vpaes_set_encrypt_key + .type 32 +.endef +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl vpaes_set_decrypt_key + +.def vpaes_set_decrypt_key + .type 32 +.endef +.align 4 +vpaes_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl vpaes_cbc_encrypt + +.def vpaes_cbc_encrypt + .type 32 +.endef +.align 4 +vpaes_cbc_encrypt: + AARCH64_SIGN_LINK_REGISTER + cbz x2, Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b Lcbc_enc_loop + +.align 4 +Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +Lcbc_abort: + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.def vpaes_cbc_decrypt + .type 32 +.endef +.align 4 +vpaes_cbc_decrypt: + // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to + // only from vpaes_cbc_encrypt which has already signed the return address. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls Lcbc_dec_done + +.align 4 +Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi Lcbc_dec_loop2x + +Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl vpaes_ctr32_encrypt_blocks + +.def vpaes_ctr32_encrypt_blocks + .type 32 +.endef +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. 
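+	// (The final word of the 16-byte IV is the 32-bit block counter, stored
+	// big-endian; it is byte-swapped into w6 for arithmetic and swapped back
+	// whenever it is written into lane 3 of v7/v14/v15.)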
+ ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-apple.S new file mode 100644 index 00000000..28f72d3b --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-apple.S @@ -0,0 +1,685 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.align 6,0x90 +L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 2213177600,1597421020,4103937655,675398315 +.long 2749646592,4273543773,1511898873,121693092 +.long 3040248576,1103263732,2871565598,1608280554 +.long 2236667136,2588920351,482954393,64377734 +.long 3069987328,291237287,2117370568,3650299247 +.long 533321216,3573750986,2572112006,1401264716 +.long 1339849704,2721158661,548607111,3445553514 +.long 2128193280,3054596040,2183486460,1257083700 +.long 655635200,1165381986,3923443150,2344132524 +.long 190078720,256924420,290342170,357187870 +.long 1610966272,2263057382,4103205268,309794674 +.long 2592527872,2233205587,1335446729,3402964816 +.long 3973531904,3225098121,3002836325,1918774430 +.long 3870401024,2102906079,2284471353,4117666579 +.long 617007872,1021508343,366931923,691083277 +.long 2528395776,3491914898,2968704004,1613121270 +.long 3445188352,3247741094,844474987,4093578302 +.long 651481088,1190302358,1689581232,574775300 +.long 4289380608,206939853,2555985458,2489840491 +.long 2130264064,327674451,3566485037,3349835193 +.long 2470714624,316102159,3636825756,3393945945 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 6,0x90 +.private_extern __vpaes_preheat +.align 4 +__vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.private_extern __vpaes_encrypt_core +.align 4 +__vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 +.byte 102,15,56,0,208 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx +.byte 102,15,56,0,193 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp L000enc_entry +.align 4,0x90 +L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + 
pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addl $16,%edx + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addl $16,%ecx + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.private_extern __vpaes_decrypt_core +.align 4 +__vpaes_decrypt_core: + leal 608(%ebp),%ebx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa -64(%ebx),%xmm2 + pandn %xmm0,%xmm1 + movl %eax,%ecx + psrld $4,%xmm1 + movdqu (%edx),%xmm5 + shll $4,%ecx + pand %xmm6,%xmm0 +.byte 102,15,56,0,208 + movdqa -48(%ebx),%xmm0 + xorl $48,%ecx +.byte 102,15,56,0,193 + andl $48,%ecx + pxor %xmm5,%xmm2 + movdqa 176(%ebp),%xmm5 + pxor %xmm2,%xmm0 + addl $16,%edx + leal -352(%ebx,%ecx,1),%ecx + jmp L002dec_entry +.align 4,0x90 +L003dec_loop: + movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addl $16,%edx +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax +L002dec_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + psrld $4,%xmm1 +.byte 102,15,56,0,208 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm7,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 + jnz L003dec_loop + movdqa 96(%ebx),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%ebx),%xmm0 + movdqa (%ecx),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + ret +.private_extern __vpaes_schedule_core +.align 4 +__vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call __vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz L004schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp L005schedule_go +L004schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%edx) + xorl $48,%ecx +L005schedule_go: + cmpl $192,%eax + ja L006schedule_256 + je L007schedule_192 +L008schedule_128: + movl $10,%eax +L009loop_schedule_128: + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call 
__vpaes_schedule_mangle + jmp L009loop_schedule_128 +.align 4,0x90 +L007schedule_192: + movdqu 8(%esi),%xmm0 + call __vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%eax +L011loop_schedule_192: + call __vpaes_schedule_round +.byte 102,15,58,15,198,8 + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + call __vpaes_schedule_mangle + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + jmp L011loop_schedule_192 +.align 4,0x90 +L006schedule_256: + movdqu 16(%esi),%xmm0 + call __vpaes_schedule_transform + movl $7,%eax +L012loop_schedule_256: + call __vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp L012loop_schedule_256 +.align 4,0x90 +L010schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz L013schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,193 + leal 352(%ebp),%ebx + addl $32,%edx +L013schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call __vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.private_extern __vpaes_schedule_192_smear +.align 4 +__vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.private_extern __vpaes_schedule_round +.align 4 +__vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 +.byte 102,15,58,15,202,15 +.byte 102,15,58,15,210,15 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + movdqa %xmm2,8(%esp) +L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 +.byte 102,15,56,0,226 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.private_extern __vpaes_schedule_transform +.align 4 +__vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%ebx),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.private_extern __vpaes_schedule_mangle +.align 4 +__vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz L014schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + jmp L015schedule_mangle_both +.align 4,0x90 +L014schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal 416(%ebp),%esi + movdqa 
%xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 32(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 64(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 96(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + addl $-16,%edx +L015schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key +.align 4 +_vpaes_set_encrypt_key: +L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L016pic_for_function_hit +L016pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+5-L016pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal L_vpaes_consts+0x30-L017pic_point,%ebp + call __vpaes_schedule_core +L017pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key +.align 4 +_vpaes_set_decrypt_key: +L_vpaes_set_decrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + shll $4,%ebx + leal 16(%edx,%ebx,1),%edx + movl $1,%edi + movl %eax,%ecx + shrl $1,%ecx + andl $32,%ecx + xorl $32,%ecx + leal L_vpaes_consts+0x30-L018pic_point,%ebp + call __vpaes_schedule_core +L018pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt +.align 4 +_vpaes_encrypt: +L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L019pic_for_function_hit +L019pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+4-L019pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal L_vpaes_consts+0x30-L020pic_point,%ebp + call __vpaes_preheat +L020pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt +.align 4 +_vpaes_decrypt: +L_vpaes_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal L_vpaes_consts+0x30-L021pic_point,%ebp + call __vpaes_preheat +L021pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl 
%ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_decrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_cbc_encrypt +.private_extern _vpaes_cbc_encrypt +.align 4 +_vpaes_cbc_encrypt: +L_vpaes_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + subl $16,%eax + jc L022cbc_abort + leal -56(%esp),%ebx + movl 36(%esp),%ebp + andl $-16,%ebx + movl 40(%esp),%ecx + xchgl %esp,%ebx + movdqu (%ebp),%xmm1 + subl %esi,%edi + movl %ebx,48(%esp) + movl %edi,(%esp) + movl %edx,4(%esp) + movl %ebp,8(%esp) + movl %eax,%edi + leal L_vpaes_consts+0x30-L023pic_point,%ebp + call __vpaes_preheat +L023pic_point: + cmpl $0,%ecx + je L024cbc_dec_loop + jmp L025cbc_enc_loop +.align 4,0x90 +L025cbc_enc_loop: + movdqu (%esi),%xmm0 + pxor %xmm1,%xmm0 + call __vpaes_encrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + movdqa %xmm0,%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc L025cbc_enc_loop + jmp L026cbc_done +.align 4,0x90 +L024cbc_dec_loop: + movdqu (%esi),%xmm0 + movdqa %xmm1,16(%esp) + movdqa %xmm0,32(%esp) + call __vpaes_decrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + pxor 16(%esp),%xmm0 + movdqa 32(%esp),%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc L024cbc_dec_loop +L026cbc_done: + movl 8(%esp),%ebx + movl 48(%esp),%esp + movdqu %xmm1,(%ebx) +L022cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-linux.S index e1f6de82..516d092d 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
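The label-renaming hunks that follow only touch BoringSSL's dispatch-test instrumentation: when BORINGSSL_DISPATCH_TEST is defined, each accelerated entry point records that it ran by storing 1 into a slot of the BORINGSSL_function_hit array (offset +4 for vpaes_encrypt and +5 for vpaes_set_encrypt_key, per the leal lines). A minimal C sketch of reading that flag, assuming the extern declaration that BoringSSL's crypto/internal.h provides under the same macro; the helper name is hypothetical:

#include <stdint.h>

// Provided by BoringSSL's crypto/internal.h when BORINGSSL_DISPATCH_TEST is
// defined; the assembly entry points store a 1 into their assigned slot.
extern uint8_t BORINGSSL_function_hit[];

// Hypothetical helper: slot 4 matches the BORINGSSL_function_hit+4 store in
// vpaes_encrypt above (slot 5 would be vpaes_set_encrypt_key).
int vpaes_encrypt_was_used(void) {
  return BORINGSSL_function_hit[4] != 0;
}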
@@ -489,10 +488,10 @@ vpaes_set_encrypt_key: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx - call .L016pic -.L016pic: + call .L016pic_for_function_hit +.L016pic_for_function_hit: popl %ebx - leal BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx + leal BORINGSSL_function_hit+5-.L016pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx @@ -574,10 +573,10 @@ vpaes_encrypt: #ifdef BORINGSSL_DISPATCH_TEST pushl %ebx pushl %edx - call .L019pic -.L019pic: + call .L019pic_for_function_hit +.L019pic_for_function_hit: popl %ebx - leal BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx + leal BORINGSSL_function_hit+4-.L019pic_for_function_hit(%ebx),%ebx movl $1,%edx movb %dl,(%ebx) popl %edx @@ -706,7 +705,6 @@ vpaes_cbc_encrypt: ret .size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-apple.S index 1b126a6c..32a84f12 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1131,7 +1130,6 @@ L$ctr_add_two: .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-linux.S index bc55da78..2a491c84 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/vpaes-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/vpaes-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1133,7 +1132,6 @@ _vpaes_consts: .size _vpaes_consts,.-_vpaes_consts .text #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-apple.S b/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-apple.S new file mode 100644 index 00000000..81253d99 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-apple.S @@ -0,0 +1,226 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
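+// bn_mul_mont computes the Montgomery product rp[] = ap[]*bp[]*R^-1 mod np[],
+// where R = 2^(32*num) for the 32-bit limbs used here and n0 points at
+// -np[0]^-1 mod 2^32; see crypto/fipsmodule/bn/internal.h for the exact C
+// prototype. It returns 1 on success and 0 when num < 4, in which case the
+// caller must fall back to the generic C implementation.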
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_mont +.private_extern _bn_mul_mont +.align 4 +_bn_mul_mont: +L_bn_mul_mont_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + movl 40(%esp),%edi + cmpl $4,%edi + jl L000just_leave + leal 20(%esp),%esi + leal 24(%esp),%edx + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%ebp + negl %edi + movl %ebp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%ebp + xorl %ebp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L001page_walk + jmp L002page_walk_done +.align 4,0x90 +L001page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L001page_walk +L002page_walk_done: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%ebp + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %ebp,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %edx,24(%esp) + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 4,0x90 +L0031st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl L0031st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +L004outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +L005inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz L005inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle L004outer + emms + jmp L006common_tail +.align 4,0x90 +L006common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 4,0x90 +L007sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge 
L007sub + sbbl $0,%eax + movl $-1,%edx + xorl %eax,%edx + jmp L008copy +.align 4,0x90 +L008copy: + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp + movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) + decl %ebx + jge L008copy + movl 24(%esp),%esp + movl $1,%eax +L000just_leave: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-linux.S b/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-linux.S new file mode 100644 index 00000000..c0e0ef07 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86-mont-linux.S @@ -0,0 +1,228 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl bn_mul_mont +.hidden bn_mul_mont +.type bn_mul_mont,@function +.align 16 +bn_mul_mont: +.L_bn_mul_mont_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + movl 40(%esp),%edi + cmpl $4,%edi + jl .L000just_leave + leal 20(%esp),%esi + leal 24(%esp),%edx + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%ebp + negl %edi + movl %ebp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%ebp + xorl %ebp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L001page_walk + jmp .L002page_walk_done +.align 16 +.L001page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L001page_walk +.L002page_walk_done: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%ebp + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %ebp,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %edx,24(%esp) + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 16 +.L0031st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl .L0031st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +.L004outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 
+ movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +.L005inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz .L005inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle .L004outer + emms + jmp .L006common_tail +.align 16 +.L006common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 16 +.L007sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge .L007sub + sbbl $0,%eax + movl $-1,%edx + xorl %eax,%edx + jmp .L008copy +.align 16 +.L008copy: + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp + movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) + decl %ebx + jge .L008copy + movl 24(%esp),%esp + movl $1,%eax +.L000just_leave: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_mul_mont,.-.L_bn_mul_mont_begin +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-apple.S index eb39df37..a6e7a2c8 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1235,7 +1234,6 @@ L$mulx4x_epilogue: .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-linux.S index 22565fe1..9a500d70 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1237,7 +1236,6 @@ _CET_ENDBR .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-apple.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-apple.S index d3a698ca..4b003639 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -8,26 +7,18 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - - -.globl _bn_mul_mont_gather5 -.private_extern _bn_mul_mont_gather5 +.globl _bn_mul_mont_gather5_nohw +.private_extern _bn_mul_mont_gather5_nohw .p2align 6 -_bn_mul_mont_gather5: +_bn_mul_mont_gather5_nohw: _CET_ENDBR + + movl %r9d,%r9d movq %rsp,%rax - testl $7,%r9d - jnz L$mul_enter - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - jmp L$mul4x_enter - -.p2align 4 -L$mul_enter: movd 8(%rsp),%xmm5 pushq %rbx @@ -454,17 +445,16 @@ L$mul_epilogue: ret +.globl _bn_mul4x_mont_gather5 +.private_extern _bn_mul4x_mont_gather5 .p2align 5 -bn_mul4x_mont_gather5: +_bn_mul4x_mont_gather5: +_CET_ENDBR .byte 0x67 movq %rsp,%rax -L$mul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je L$mulx4x_enter pushq %rbx pushq %rbp @@ -480,6 +470,9 @@ L$mul4x_enter: L$mul4x_prologue: .byte 0x67 + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -1089,20 +1082,15 @@ L$inner4x: jmp L$sqr4x_sub_entry -.globl _bn_power5 -.private_extern _bn_power5 +.globl _bn_power5_nohw +.private_extern _bn_power5_nohw .p2align 5 -_bn_power5: +_bn_power5_nohw: _CET_ENDBR movq %rsp,%rax - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je L$powerx5_enter pushq %rbx pushq %rbp @@ -1117,6 +1105,9 @@ _CET_ENDBR L$power5_prologue: + + + shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 @@ -2068,13 +2059,15 @@ L$sqr4x_sub_entry: ret +.globl _bn_mulx4x_mont_gather5 +.private_extern _bn_mulx4x_mont_gather5 .p2align 5 -bn_mulx4x_mont_gather5: +_bn_mulx4x_mont_gather5: +_CET_ENDBR movq %rsp,%rax -L$mulx4x_enter: pushq %rbx pushq %rbp @@ -2089,6 +2082,9 @@ L$mulx4x_enter: L$mulx4x_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -2605,13 +2601,15 @@ L$mulx4x_inner: jmp L$sqrx4x_sub_entry +.globl _bn_powerx5 +.private_extern _bn_powerx5 .p2align 5 -bn_powerx5: +_bn_powerx5: +_CET_ENDBR movq %rsp,%rax -L$powerx5_enter: pushq %rbx pushq %rbp @@ -2626,6 +2624,9 @@ L$powerx5_enter: L$powerx5_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -3624,7 +3625,6 @@ L$inc: .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-linux.S similarity index 98% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-linux.S index 894fe5fd..6cec7db0 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/x86_64-mont5-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/bcm/x86_64-mont5-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -8,27 +7,18 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl bn_mul_mont_gather5 -.hidden bn_mul_mont_gather5 -.type bn_mul_mont_gather5,@function +.globl bn_mul_mont_gather5_nohw +.hidden bn_mul_mont_gather5_nohw +.type bn_mul_mont_gather5_nohw,@function .align 64 -bn_mul_mont_gather5: +bn_mul_mont_gather5_nohw: .cfi_startproc _CET_ENDBR + + movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax - testl $7,%r9d - jnz .Lmul_enter - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - jmp .Lmul4x_enter - -.align 16 -.Lmul_enter: movd 8(%rsp),%xmm5 pushq %rbx .cfi_offset %rbx,-16 @@ -454,18 +444,17 @@ _CET_ENDBR .Lmul_epilogue: ret .cfi_endproc -.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 +.size bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw +.globl bn_mul4x_mont_gather5 +.hidden bn_mul4x_mont_gather5 .type bn_mul4x_mont_gather5,@function .align 32 bn_mul4x_mont_gather5: .cfi_startproc +_CET_ENDBR .byte 0x67 movq %rsp,%rax .cfi_def_cfa_register %rax -.Lmul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -481,6 +470,9 @@ bn_mul4x_mont_gather5: .Lmul4x_prologue: .byte 0x67 + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -1090,20 +1082,15 @@ mul4x_internal: jmp .Lsqr4x_sub_entry .cfi_endproc .size mul4x_internal,.-mul4x_internal -.globl bn_power5 -.hidden bn_power5 -.type bn_power5,@function +.globl bn_power5_nohw +.hidden bn_power5_nohw +.type bn_power5_nohw,@function .align 32 -bn_power5: +bn_power5_nohw: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -1118,6 +1105,9 @@ _CET_ENDBR .cfi_offset %r15,-56 .Lpower5_prologue: + + + shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 @@ -1226,7 +1216,7 @@ _CET_ENDBR .Lpower5_epilogue: ret .cfi_endproc -.size bn_power5,.-bn_power5 +.size bn_power5_nohw,.-bn_power5_nohw .globl bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -2069,13 +2059,15 @@ __bn_post4x_internal: ret .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal +.globl bn_mulx4x_mont_gather5 +.hidden bn_mulx4x_mont_gather5 .type bn_mulx4x_mont_gather5,@function .align 32 bn_mulx4x_mont_gather5: .cfi_startproc +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax -.Lmulx4x_enter: pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2090,6 +2082,9 @@ bn_mulx4x_mont_gather5: .cfi_offset %r15,-56 .Lmulx4x_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -2606,13 +2601,15 @@ mulx4x_internal: jmp .Lsqrx4x_sub_entry .cfi_endproc .size mulx4x_internal,.-mulx4x_internal +.globl bn_powerx5 +.hidden bn_powerx5 .type bn_powerx5,@function .align 32 bn_powerx5: .cfi_startproc +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax -.Lpowerx5_enter: pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2627,6 +2624,9 @@ bn_powerx5: .cfi_offset %r15,-56 .Lpowerx5_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -3625,7 +3625,6 @@ _CET_ENDBR .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text #endif -#endif // defined(__x86_64__) && 
defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-apple.S index 9b474516..54a78781 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -3081,7 +3080,6 @@ _CET_ENDBR #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-linux.S index 67d9f8ab..cd46241a 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/aes128gcmsiv-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/aes128gcmsiv-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -3091,7 +3090,6 @@ _CET_ENDBR .cfi_endproc .size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-linux.linux.arm.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv4-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-linux.linux.arm.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-armv4-linux.S index 9a546d9c..75cd8b27 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv4-linux.linux.arm.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv4-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__arm__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -1451,7 +1450,6 @@ ChaCha20_ctr32_neon: .size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) -#endif // defined(__arm__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-apple.S index 675f6a33..d82a7d53 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1968,7 +1967,6 @@ Ldone_512_neon: ret #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-linux.S index 983f3af0..183a3b79 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -1968,7 +1967,6 @@ ChaCha20_512_neon: ret .size ChaCha20_512_neon,.-ChaCha20_512_neon #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-win.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-win.S new file mode 100644 index 00000000..3d766f37 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-armv8-win.S @@ -0,0 +1,1979 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include + +.section .rodata + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl ChaCha20_ctr32_nohw + +.def ChaCha20_ctr32_nohw + .type 32 +.endef +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl ChaCha20_ctr32_neon + +.def ChaCha20_ctr32_neon + .type 32 +.endef +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli 
v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext 
v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor 
v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def ChaCha20_512_neon + .type 32 +.endef +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor 
v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 
+ ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + 
add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror 
w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext 
v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add 
v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // 
store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-apple.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-apple.S new file mode 100644 index 00000000..997b2a2c --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-apple.S @@ -0,0 +1,962 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
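For reference, the AArch64 NEON file that ends above and the 32-bit x86 file that begins below are generated implementations of the same ChaCha20 core. Its quarter round is recognizable from the rotation schedule 16/12/8/7: on the scalar AArch64 side as ror by #16/#20/#24/#25, on the vector side as rev32 plus ushr/sli pairs, and on x86 below as roll $16/$12/$8/$7. A minimal scalar C sketch of that quarter round (illustrative only, not the generated code):

#include <stdint.h>

/* One ChaCha20 quarter round on four 32-bit state words (RFC 8439).
 * The generated assembly runs several of these in parallel per lane. */
static inline uint32_t rotl32(uint32_t x, int n) {
  return (x << n) | (x >> (32 - n));
}

static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d) {
  *a += *b; *d ^= *a; *d = rotl32(*d, 16);
  *c += *d; *b ^= *c; *b = rotl32(*b, 12);
  *a += *b; *d ^= *a; *d = rotl32(*d, 8);
  *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}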
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw +.align 4 +_ChaCha20_ctr32_nohw: +L_ChaCha20_ctr32_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 32(%esp),%esi + movl 36(%esp),%edi + subl $132,%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,80(%esp) + movl %ebx,84(%esp) + movl %ecx,88(%esp) + movl %edx,92(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,96(%esp) + movl %ebx,100(%esp) + movl %ecx,104(%esp) + movl %edx,108(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + subl $1,%eax + movl %eax,112(%esp) + movl %ebx,116(%esp) + movl %ecx,120(%esp) + movl %edx,124(%esp) + jmp L000entry +.align 4,0x90 +L001outer_loop: + movl %ebx,156(%esp) + movl %eax,152(%esp) + movl %ecx,160(%esp) +L000entry: + movl $1634760805,%eax + movl $857760878,4(%esp) + movl $2036477234,8(%esp) + movl $1797285236,12(%esp) + movl 84(%esp),%ebx + movl 88(%esp),%ebp + movl 104(%esp),%ecx + movl 108(%esp),%esi + movl 116(%esp),%edx + movl 120(%esp),%edi + movl %ebx,20(%esp) + movl %ebp,24(%esp) + movl %ecx,40(%esp) + movl %esi,44(%esp) + movl %edx,52(%esp) + movl %edi,56(%esp) + movl 92(%esp),%ebx + movl 124(%esp),%edi + movl 112(%esp),%edx + movl 80(%esp),%ebp + movl 96(%esp),%ecx + movl 100(%esp),%esi + addl $1,%edx + movl %ebx,28(%esp) + movl %edi,60(%esp) + movl %edx,112(%esp) + movl $10,%ebx + jmp L002loop +.align 4,0x90 +L002loop: + addl %ebp,%eax + movl %ebx,128(%esp) + movl %ebp,%ebx + xorl %eax,%edx + roll $16,%edx + addl %edx,%ecx + xorl %ecx,%ebx + movl 52(%esp),%edi + roll $12,%ebx + movl 20(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,48(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,32(%esp) + roll $16,%edi + movl %ebx,16(%esp) + addl %edi,%esi + movl 40(%esp),%ecx + xorl %esi,%ebp + movl 56(%esp),%edx + roll $12,%ebp + movl 24(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,52(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,36(%esp) + roll $16,%edx + movl %ebp,20(%esp) + addl %edx,%ecx + movl 44(%esp),%esi + xorl %ecx,%ebx + movl 60(%esp),%edi + roll $12,%ebx + movl 28(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,56(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,24(%esp) + addl %edi,%esi + xorl %esi,%ebp + roll $12,%ebp + movl 20(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,%edx + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + roll $16,%edx + movl %ebp,28(%esp) + addl %edx,%ecx + xorl %ecx,%ebx + movl 48(%esp),%edi + roll $12,%ebx + movl 24(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,60(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,40(%esp) + roll $16,%edi + movl %ebx,20(%esp) + addl %edi,%esi + movl 32(%esp),%ecx + xorl %esi,%ebp + movl 52(%esp),%edx + roll $12,%ebp + movl 28(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + 
roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,48(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,44(%esp) + roll $16,%edx + movl %ebp,24(%esp) + addl %edx,%ecx + movl 36(%esp),%esi + xorl %ecx,%ebx + movl 56(%esp),%edi + roll $12,%ebx + movl 16(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,52(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,28(%esp) + addl %edi,%esi + xorl %esi,%ebp + movl 48(%esp),%edx + roll $12,%ebp + movl 128(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,56(%esp) + xorl %esi,%ebp + roll $7,%ebp + decl %ebx + jnz L002loop + movl 160(%esp),%ebx + addl $1634760805,%eax + addl 80(%esp),%ebp + addl 96(%esp),%ecx + addl 100(%esp),%esi + cmpl $64,%ebx + jb L003tail + movl 156(%esp),%ebx + addl 112(%esp),%edx + addl 120(%esp),%edi + xorl (%ebx),%eax + xorl 16(%ebx),%ebp + movl %eax,(%esp) + movl 152(%esp),%eax + xorl 32(%ebx),%ecx + xorl 36(%ebx),%esi + xorl 48(%ebx),%edx + xorl 56(%ebx),%edi + movl %ebp,16(%eax) + movl %ecx,32(%eax) + movl %esi,36(%eax) + movl %edx,48(%eax) + movl %edi,56(%eax) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + xorl 4(%ebx),%ebp + xorl 8(%ebx),%ecx + xorl 12(%ebx),%esi + xorl 20(%ebx),%edx + xorl 24(%ebx),%edi + movl %ebp,4(%eax) + movl %ecx,8(%eax) + movl %esi,12(%eax) + movl %edx,20(%eax) + movl %edi,24(%eax) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + xorl 28(%ebx),%ebp + xorl 40(%ebx),%ecx + xorl 44(%ebx),%esi + xorl 52(%ebx),%edx + xorl 60(%ebx),%edi + leal 64(%ebx),%ebx + movl %ebp,28(%eax) + movl (%esp),%ebp + movl %ecx,40(%eax) + movl 160(%esp),%ecx + movl %esi,44(%eax) + movl %edx,52(%eax) + movl %edi,60(%eax) + movl %ebp,(%eax) + leal 64(%eax),%eax + subl $64,%ecx + jnz L001outer_loop + jmp L004done +L003tail: + addl 112(%esp),%edx + addl 120(%esp),%edi + movl %eax,(%esp) + movl %ebp,16(%esp) + movl %ecx,32(%esp) + movl %esi,36(%esp) + movl %edx,48(%esp) + movl %edi,56(%esp) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + movl %ebp,4(%esp) + movl %ecx,8(%esp) + movl %esi,12(%esp) + movl %edx,20(%esp) + movl %edi,24(%esp) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + movl %ebp,28(%esp) + movl 156(%esp),%ebp + movl %ecx,40(%esp) + movl 152(%esp),%ecx + movl %esi,44(%esp) + xorl %esi,%esi + movl %edx,52(%esp) + movl %edi,60(%esp) + xorl %eax,%eax + xorl %edx,%edx +L005tail_loop: + movb (%esi,%ebp,1),%al + movb (%esp,%esi,1),%dl + leal 1(%esi),%esi + xorb %dl,%al + movb %al,-1(%ecx,%esi,1) + decl %ebx + jnz L005tail_loop +L004done: + addl $132,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _ChaCha20_ctr32_ssse3 +.private_extern _ChaCha20_ctr32_ssse3 +.align 4 +_ChaCha20_ctr32_ssse3: 
+L_ChaCha20_ctr32_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call Lpic_point +Lpic_point: + popl %eax + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal Lssse3_data-Lpic_point(%eax),%eax + movdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb L0061x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + movdqu (%edx),%xmm7 + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + paddd 48(%eax),%xmm0 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + psubd 64(%eax),%xmm0 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,64(%ebp) + movdqa %xmm1,80(%ebp) + movdqa %xmm2,96(%ebp) + movdqa %xmm3,112(%ebp) + movdqu 16(%edx),%xmm3 + movdqa %xmm4,-64(%ebp) + movdqa %xmm5,-48(%ebp) + movdqa %xmm6,-32(%ebp) + movdqa %xmm7,-16(%ebp) + movdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,(%ebp) + movdqa %xmm1,16(%ebp) + movdqa %xmm2,32(%ebp) + movdqa %xmm3,48(%ebp) + movdqa %xmm4,-128(%ebp) + movdqa %xmm5,-112(%ebp) + movdqa %xmm6,-96(%ebp) + movdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp L007outer_loop +.align 4,0x90 +L007outer_loop: + movdqa -112(%ebp),%xmm1 + movdqa -96(%ebp),%xmm2 + movdqa -80(%ebp),%xmm3 + movdqa -48(%ebp),%xmm5 + movdqa -32(%ebp),%xmm6 + movdqa -16(%ebp),%xmm7 + movdqa %xmm1,-112(%ebx) + movdqa %xmm2,-96(%ebx) + movdqa %xmm3,-80(%ebx) + movdqa %xmm5,-48(%ebx) + movdqa %xmm6,-32(%ebx) + movdqa %xmm7,-16(%ebx) + movdqa 32(%ebp),%xmm2 + movdqa 48(%ebp),%xmm3 + movdqa 64(%ebp),%xmm4 + movdqa 80(%ebp),%xmm5 + movdqa 96(%ebp),%xmm6 + movdqa 112(%ebp),%xmm7 + paddd 64(%eax),%xmm4 + movdqa %xmm2,32(%ebx) + movdqa %xmm3,48(%ebx) + movdqa %xmm4,64(%ebx) + movdqa %xmm5,80(%ebx) + movdqa %xmm6,96(%ebx) + movdqa %xmm7,112(%ebx) + movdqa %xmm4,64(%ebp) + movdqa -128(%ebp),%xmm0 + movdqa %xmm4,%xmm6 + movdqa -64(%ebp),%xmm3 + movdqa (%ebp),%xmm4 + movdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 4,0x90 +L008loop: + paddd %xmm3,%xmm0 + movdqa %xmm3,%xmm2 + pxor %xmm0,%xmm6 + pshufb (%eax),%xmm6 + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -48(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 80(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,64(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-64(%ebx) + paddd %xmm7,%xmm5 + movdqa 32(%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -32(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 96(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,80(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,16(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-48(%ebx) + paddd %xmm6,%xmm4 + movdqa 48(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa 
-16(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 112(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,96(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-32(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa -48(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,%xmm6 + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + pshufb (%eax),%xmm6 + movdqa %xmm3,-16(%ebx) + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -32(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 64(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,112(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,32(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-48(%ebx) + paddd %xmm7,%xmm5 + movdqa (%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -16(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 80(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,64(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,48(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-32(%ebx) + paddd %xmm6,%xmm4 + movdqa 16(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -64(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 96(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,80(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-16(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 64(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,96(%ebx) + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + por %xmm1,%xmm3 + decl %edx + jnz L008loop + movdqa %xmm3,-64(%ebx) + movdqa %xmm4,(%ebx) + movdqa %xmm5,16(%ebx) + movdqa %xmm6,64(%ebx) + movdqa %xmm7,96(%ebx) + movdqa -112(%ebx),%xmm1 + movdqa -96(%ebx),%xmm2 + movdqa -80(%ebx),%xmm3 + paddd -128(%ebp),%xmm0 + paddd -112(%ebp),%xmm1 + paddd -96(%ebp),%xmm2 + paddd -80(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu 
-64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa -64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa -48(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa -32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa -16(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd -64(%ebp),%xmm0 + paddd -48(%ebp),%xmm1 + paddd -32(%ebp),%xmm2 + paddd -16(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa (%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 16(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 48(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd (%ebp),%xmm0 + paddd 16(%ebp),%xmm1 + paddd 32(%ebp),%xmm2 + paddd 48(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa 64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 80(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 96(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 112(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd 64(%ebp),%xmm0 + paddd 80(%ebp),%xmm1 + paddd 96(%ebp),%xmm2 + paddd 112(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 208(%esi),%esi + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm5 + pxor %xmm2,%xmm6 + pxor %xmm3,%xmm7 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc L007outer_loop + addl $256,%ecx + jz L009done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + movd 64(%ebp),%xmm2 + movdqu (%ebx),%xmm3 + paddd 96(%eax),%xmm2 + pand 112(%eax),%xmm3 + por %xmm2,%xmm3 +L0061x: + movdqa 32(%eax),%xmm0 + movdqu (%edx),%xmm1 + movdqu 16(%edx),%xmm2 + movdqa (%eax),%xmm6 + movdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + movl $10,%edx + jmp L010loop1x +.align 4,0x90 +L011outer1x: + movdqa 80(%eax),%xmm3 + movdqa (%esp),%xmm0 + movdqa 16(%esp),%xmm1 + movdqa 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + movl $10,%edx + movdqa %xmm3,48(%esp) + jmp L010loop1x +.align 4,0x90 +L010loop1x: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 
+ por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %edx + jnz L010loop1x + paddd (%esp),%xmm0 + paddd 16(%esp),%xmm1 + paddd 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + cmpl $64,%ecx + jb L012tail + movdqu (%esi),%xmm4 + movdqu 16(%esi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%esi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%esi),%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + leal 64(%esi),%esi + movdqu %xmm0,(%edi) + movdqu %xmm1,16(%edi) + movdqu %xmm2,32(%edi) + movdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz L011outer1x + jmp L009done +L012tail: + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +L013tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz L013tail_loop +L009done: + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +Lssse3_data: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.long 1634760805,857760878,2036477234,1797285236 +.long 0,1,2,3 +.long 4,4,4,4 +.long 1,0,0,0 +.long 4,0,0,0 +.long 0,-1,-1,-1 +.align 6,0x90 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +.byte 114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-linux.S index eb940347..73a49e6e 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
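The x86 file above loads the state with the constants 1634760805, 857760878, 2036477234, 1797285236 (the "expand 32-byte k" sigma words, also spelled out in its trailing .byte string) and runs the round loop ten times (movl $10 ... decl/jnz), i.e. ten column-plus-diagonal double rounds. A C sketch of that block structure, reusing chacha_quarter_round from the sketch above (again illustrative, not the generated code):

/* One ChaCha20 block: the 4x4 state is sigma, eight key words, the block
 * counter and three nonce words; after ten double rounds the original
 * input state is added back in. Requires chacha_quarter_round above. */
static void chacha20_block(uint32_t out[16], const uint32_t in[16]) {
  uint32_t x[16];
  for (int i = 0; i < 16; i++) x[i] = in[i];
  for (int i = 0; i < 10; i++) {                         /* 20 rounds total */
    chacha_quarter_round(&x[0], &x[4], &x[8],  &x[12]);  /* columns */
    chacha_quarter_round(&x[1], &x[5], &x[9],  &x[13]);
    chacha_quarter_round(&x[2], &x[6], &x[10], &x[14]);
    chacha_quarter_round(&x[3], &x[7], &x[11], &x[15]);
    chacha_quarter_round(&x[0], &x[5], &x[10], &x[15]);  /* diagonals */
    chacha_quarter_round(&x[1], &x[6], &x[11], &x[12]);
    chacha_quarter_round(&x[2], &x[7], &x[8],  &x[13]);
    chacha_quarter_round(&x[3], &x[4], &x[9],  &x[14]);
  }
  for (int i = 0; i < 16; i++) out[i] = x[i] + in[i];
}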
@@ -961,7 +960,6 @@ ChaCha20_ctr32_ssse3: .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-apple.S index 6a51e976..0a1f1b48 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -461,7 +460,6 @@ _ChaCha20_ctr32_ssse3_4x: _CET_ENDBR movq %rsp,%r9 - movq %r10,%r11 subq $0x140+8,%rsp movdqa L$sigma(%rip),%xmm11 movdqu (%rcx),%xmm15 @@ -1604,7 +1602,6 @@ L$8x_epilogue: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-linux.S index 7a4759c5..dee0a994 100644 --- a/Sources/CCryptoBoringSSL/crypto/chacha/chacha-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -467,7 +466,6 @@ ChaCha20_ctr32_ssse3_4x: _CET_ENDBR movq %rsp,%r9 .cfi_def_cfa_register r9 - movq %r10,%r11 subq $0x140+8,%rsp movdqa .Lsigma(%rip),%xmm11 movdqu (%rcx),%xmm15 @@ -1610,7 +1608,6 @@ _CET_ENDBR .cfi_endproc .size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-ios.ios.aarch64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-ios.ios.aarch64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-apple.S index c39f8c08..ac2500d7 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-ios.ios.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
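The renames in these hunks all follow one pattern: the pregenerated assembly moves from per-platform filenames under crypto/ (e.g. chacha-x86-linux.linux.x86.S) to gen/crypto/ (chacha-x86-linux.S), and the hand-added outer guard such as #if defined(__i386__) && defined(__linux__) is dropped, leaving only the guard emitted by the generator. Schematically (not the literal file contents):

/* Before: crypto/chacha/chacha-x86-linux.linux.x86.S
 *   #if defined(__i386__) && defined(__linux__)
 *   #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
 *   ...generated assembly...
 *   #endif
 *   #endif
 * After: gen/crypto/chacha-x86-linux.S keeps only the inner guard. */
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
/* ...generated assembly... */
#endif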
@@ -3009,7 +3008,6 @@ Lopen_128_hash_64: .cfi_endproc #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) -#endif // defined(__aarch64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-linux.linux.aarch64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-linux.linux.aarch64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-linux.S index 45387193..30ef5ce4 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_armv8-linux.linux.aarch64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__aarch64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -3009,7 +3008,6 @@ chacha20_poly1305_open: .cfi_endproc .size chacha20_poly1305_open,.-chacha20_poly1305_open #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) -#endif // defined(__aarch64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-win.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-win.S new file mode 100644 index 00000000..e8952e59 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_armv8-win.S @@ -0,0 +1,3020 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
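The new Windows AArch64 ChaCha20-Poly1305 file below opens with the shared constant tables: Lchacha20_consts ("expand 32-byte k"), Linc, the Lrol8 byte shuffle used for the rotate-by-8 step, and Lclamp, the mask applied to the Poly1305 r key. As in RFC 8439, the one-time Poly1305 key pair (r, s) is taken from the first ChaCha20 key-stream block and r is clamped with exactly the two quadwords in Lclamp. A minimal sketch, with hypothetical helper and parameter names (block0 stands for the first 64-byte key-stream block; assumes a little-endian host, as on AArch64):

#include <stdint.h>
#include <string.h>

/* Derive the one-time Poly1305 key halves from the first key-stream block:
 * r = bytes 0..15 masked by Lclamp, s = bytes 16..31. */
static void poly1305_key_from_block0(const uint8_t block0[64],
                                     uint64_t r[2], uint64_t s[2]) {
  memcpy(r, block0, 16);
  memcpy(s, block0 + 16, 16);
  r[0] &= 0x0FFFFFFC0FFFFFFFull;  /* the two quadwords of Lclamp */
  r[1] &= 0x0FFFFFFC0FFFFFFCull;
}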
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#include +.section .rodata + +.align 7 +Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +Linc: +.long 1,2,3,4 +Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.def Lpoly_hash_ad_internal + .type 32 +.endef +.align 6 +Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, Lpoly_hash_intro + ret + +Lpoly_hash_intro: + cmp x4, #16 + b.lt Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lpoly_hash_ad_internal + +Lpoly_hash_ad_tail: + cbz x4, Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lpoly_hash_ad_ret: + ret +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal + +.def chacha20_poly1305_seal + .type 32 +.endef +.align 6 +chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
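The mul/umulh ladder repeated throughout Lpoly_hash_ad_internal (and later interleaved with the ChaCha20 rounds in the seal path) is one Poly1305 block update: a 16-byte block plus its padding bit is added into the 130-bit accumulator held in three 64-bit limbs, multiplied by the clamped r, and partially reduced modulo 2^130 - 5 using 2^130 ≡ 5. A scalar C sketch of the same arithmetic, with illustrative names rather than BoringSSL's API, using the compiler's 128-bit integer where the assembly uses mul/umulh pairs:

#include <stdint.h>

typedef unsigned __int128 u128;

/* One Poly1305 block: acc = ((acc + m + pad*2^128) * r) mod 2^130 - 5.
 * acc[0..2] are the accumulator limbs (acc[2] stays small), r[1]:r[0] is
 * the clamped key half. Mirrors the adds/adcs accumulate, the mul/umulh
 * multiply, and the "fold the top bits times 5 back in" reduction. */
static void poly1305_block(uint64_t acc[3], const uint64_t r[2],
                           uint64_t m0, uint64_t m1, uint64_t pad) {
  /* acc += block || pad bit */
  u128 a0 = (u128)acc[0] + m0;
  u128 a1 = (u128)acc[1] + m1 + (uint64_t)(a0 >> 64);
  uint64_t h0 = (uint64_t)a0, h1 = (uint64_t)a1;
  uint64_t h2 = acc[2] + pad + (uint64_t)(a1 >> 64);

  /* [t3:t2:t1:t0] = [h2:h1:h0] * [r1:r0] (requires r clamped) */
  u128 d0 = (u128)h0 * r[0];
  u128 d1 = (u128)h1 * r[0] + (u128)h0 * r[1] + (uint64_t)(d0 >> 64);
  u128 d2 = (u128)h2 * r[0] + (u128)h1 * r[1] + (uint64_t)(d1 >> 64);
  uint64_t t0 = (uint64_t)d0, t1 = (uint64_t)d1, t2 = (uint64_t)d2;
  uint64_t t3 = h2 * r[1] + (uint64_t)(d2 >> 64);

  /* Fold bits >= 2^130 back in: result = low 130 bits + 5*high, 5 = 4 + 1. */
  uint64_t c_lo = (t2 >> 2) | (t3 << 62);
  uint64_t c_hi = t3 >> 2;
  u128 e0 = (u128)t0 + (t2 & ~(uint64_t)3) + c_lo;
  u128 e1 = (u128)t1 + t3 + c_hi + (uint64_t)(e0 >> 64);
  acc[0] = (uint64_t)e0;
  acc[1] = (uint64_t)e1;
  acc[2] = (t2 & 3) + (uint64_t)(e1 >> 64);   /* stays a few bits at most */
}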
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + 
add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, 
v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +Lseal_main_loop: + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 +Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor 
v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = 
[acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, 
v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b Lseal_main_loop + +Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. + cmp x2, #64 + b.lt Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, 
x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b Lseal_tail + +Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b Lseal_tail_64 + +Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_hash_extra: + cbz x4, Lseal_finalize + +Lseal_hash_extra_loop: + cmp x4, #16 + b.lt Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lseal_hash_extra_loop + +Lseal_hash_extra_tail: + cbz x4, Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli 
v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + b Lseal_tail +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open + +.def chacha20_poly1305_open + .type 32 +.endef +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! 
+.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_ad_done: + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +Lopen_main_loop: + + cmp x2, #192 + b.lt Lopen_tail + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, 
Lopen_main_loop_rounds_short + +.align 5 +Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 
is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, 
v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b Lopen_main_loop + +Lopen_tail: + + cbz x2, Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le Lopen_tail_64 + cmp x2, #128 + b.le Lopen_tail_128 + +Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, Lopen_tail_192_rounds_no_hash + +Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul 
x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt Lopen_tail_192_rounds + subs x6, x6, #1 
+ b.ge Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +Lopen_tail_192_hash: + cbz x4, Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_tail_192_hash + +Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b Lopen_tail_64_store + +Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, 
v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_128_rounds + cbz x4, Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_128_rounds + +Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b Lopen_tail_64_store + +Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, 
v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_64_rounds + cbz x4, Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_64_rounds + +Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +Lopen_tail_64_store: + cmp x2, #16 + b.lt Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b Lopen_tail_64_store + +Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lopen_tail_16_store + +Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add 
v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_128_store: + cmp x2, #64 + b.lt Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 
+ adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + 
+Lopen_128_hash_64: + cbz x4, Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_128_hash_64 +.cfi_endproc + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-apple.S index b4ff5f05..7ca074e7 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-apple.S @@ -1,18 +1,13 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) -.text - - -chacha20_poly1305_constants: - .section __DATA,__const .p2align 6 +chacha20_poly1305_constants: L$chacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' @@ -218,11 +213,11 @@ L$hash_ad_done: -.globl _chacha20_poly1305_open -.private_extern _chacha20_poly1305_open +.globl _chacha20_poly1305_open_nohw +.private_extern _chacha20_poly1305_open_nohw .p2align 6 -_chacha20_poly1305_open: +_chacha20_poly1305_open_nohw: _CET_ENDBR pushq %rbp @@ -251,11 +246,6 @@ _CET_ENDBR movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) - movl _OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_open_avx2 - cmpq $128,%rbx jbe L$open_sse_128 @@ -2090,11 +2080,11 @@ L$open_sse_128_xor_hash: -.globl _chacha20_poly1305_seal -.private_extern _chacha20_poly1305_seal +.globl _chacha20_poly1305_seal_nohw +.private_extern _chacha20_poly1305_seal_nohw .p2align 6 -_chacha20_poly1305_seal: +_chacha20_poly1305_seal_nohw: _CET_ENDBR pushq %rbp @@ -2124,11 +2114,6 @@ _CET_ENDBR movq %rbx,8+0+32(%rbp) movq %rdx,%rbx - movl _OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_seal_avx2 - cmpq $128,%rbx jbe L$seal_sse_128 @@ -4077,20 +4062,38 @@ L$seal_sse_128_rounds: +.globl _chacha20_poly1305_open_avx2 +.private_extern _chacha20_poly1305_open_avx2 .p2align 6 -chacha20_poly1305_open_avx2: +_chacha20_poly1305_open_avx2: +_CET_ENDBR + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + + pushq %r15 + pushq %r9 + subq $288 + 0 + 32,%rsp + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) vzeroupper vmovdqa L$chacha20_consts(%rip),%ymm0 @@ -6225,20 +6228,39 @@ L$open_avx2_320_rounds: +.globl _chacha20_poly1305_seal_avx2 +.private_extern _chacha20_poly1305_seal_avx2 .p2align 6 -chacha20_poly1305_seal_avx2: +_chacha20_poly1305_seal_avx2: + +_CET_ENDBR + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %r9 + subq $288 + 0 + 32,%rsp + leaq 32(%rsp),%rbp + andq $-32,%rbp + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx vzeroupper vmovdqa L$chacha20_consts(%rip),%ymm0 @@ -8875,7 +8897,6 @@ L$seal_avx2_exit: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-linux.S index 3f3c641d..d80e89dd 100644 --- a/Sources/CCryptoBoringSSL/crypto/cipher_extra/chacha20_poly1305_x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/chacha20_poly1305_x86_64-linux.S @@ -1,19 +1,13 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
#include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -chacha20_poly1305_constants: - .section .rodata .align 64 +chacha20_poly1305_constants: .Lchacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' @@ -219,11 +213,11 @@ poly_hash_ad_internal: .cfi_endproc .size poly_hash_ad_internal, .-poly_hash_ad_internal -.globl chacha20_poly1305_open -.hidden chacha20_poly1305_open -.type chacha20_poly1305_open,@function +.globl chacha20_poly1305_open_nohw +.hidden chacha20_poly1305_open_nohw +.type chacha20_poly1305_open_nohw,@function .align 64 -chacha20_poly1305_open: +chacha20_poly1305_open_nohw: .cfi_startproc _CET_ENDBR pushq %rbp @@ -259,11 +253,6 @@ _CET_ENDBR movq %r8,0+0+32(%rbp) movq %rbx,8+0+32(%rbp) - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_open_avx2 - cmpq $128,%rbx jbe .Lopen_sse_128 @@ -2096,7 +2085,7 @@ _CET_ENDBR movdqa %xmm10,%xmm6 movdqa %xmm14,%xmm10 jmp .Lopen_sse_128_xor_hash -.size chacha20_poly1305_open, .-chacha20_poly1305_open +.size chacha20_poly1305_open_nohw, .-chacha20_poly1305_open_nohw .cfi_endproc @@ -2105,11 +2094,11 @@ _CET_ENDBR -.globl chacha20_poly1305_seal -.hidden chacha20_poly1305_seal -.type chacha20_poly1305_seal,@function +.globl chacha20_poly1305_seal_nohw +.hidden chacha20_poly1305_seal_nohw +.type chacha20_poly1305_seal_nohw,@function .align 64 -chacha20_poly1305_seal: +chacha20_poly1305_seal_nohw: .cfi_startproc _CET_ENDBR pushq %rbp @@ -2146,11 +2135,6 @@ _CET_ENDBR movq %rbx,8+0+32(%rbp) movq %rdx,%rbx - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_seal_avx2 - cmpq $128,%rbx jbe .Lseal_sse_128 @@ -4102,32 +4086,50 @@ process_extra_in_trailer: movq %r8,%r8 call poly_hash_ad_internal jmp .Lseal_sse_128_tail_xor -.size chacha20_poly1305_seal, .-chacha20_poly1305_seal +.size chacha20_poly1305_seal_nohw, .-chacha20_poly1305_seal_nohw .cfi_endproc +.globl chacha20_poly1305_open_avx2 +.hidden chacha20_poly1305_open_avx2 .type chacha20_poly1305_open_avx2,@function .align 64 chacha20_poly1305_open_avx2: .cfi_startproc - - +_CET_ENDBR + pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 + pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 + pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 + pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 + pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 + + + pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + vzeroupper vmovdqa .Lchacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 @@ -6261,27 +6263,46 @@ chacha20_poly1305_open_avx2: .cfi_endproc +.globl chacha20_poly1305_seal_avx2 +.hidden chacha20_poly1305_seal_avx2 .type chacha20_poly1305_seal_avx2,@function .align 64 chacha20_poly1305_seal_avx2: .cfi_startproc - - +_CET_ENDBR + pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 + pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 + pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 + pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 + pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset 
%r15,-56 + + + pushq %r9 .cfi_adjust_cfa_offset 8 .cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp .cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx vzeroupper vmovdqa .Lchacha20_consts(%rip),%ymm0 @@ -8918,7 +8939,6 @@ chacha20_poly1305_seal_avx2: .cfi_endproc .size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/err/err_data.c b/Sources/CCryptoBoringSSL/gen/crypto/err_data.c similarity index 78% rename from Sources/CCryptoBoringSSL/crypto/err/err_data.c rename to Sources/CCryptoBoringSSL/gen/crypto/err_data.c index 1068c5f6..d7612384 100644 --- a/Sources/CCryptoBoringSSL/crypto/err/err_data.c +++ b/Sources/CCryptoBoringSSL/gen/crypto/err_data.c @@ -12,7 +12,7 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - /* This file was generated by err_data_generate.go. */ + /* This file was generated by go run ./util/pregenerate. */ #include #include @@ -76,54 +76,54 @@ const uint32_t kOpenSSLReasonValues[] = { 0xc3b00f7, 0xc3b8921, 0x10320892, - 0x10329641, - 0x1033164d, - 0x10339666, - 0x10341679, + 0x10329654, + 0x10331660, + 0x10339679, + 0x1034168c, 0x10348f93, 0x10350cdf, - 0x1035968c, - 0x103616b6, - 0x103696c9, - 0x103716e8, - 0x10379701, - 0x10381716, - 0x10389734, - 0x10391743, - 0x1039975f, - 0x103a177a, - 0x103a9789, - 0x103b17a5, - 0x103b97c0, - 0x103c17e6, + 0x1035969f, + 0x103616c9, + 0x103696dc, + 0x103716fb, + 0x10379714, + 0x10381729, + 0x10389747, + 0x10391756, + 0x10399772, + 0x103a178d, + 0x103a979c, + 0x103b17b8, + 0x103b97d3, + 0x103c17f9, 0x103c80f7, - 0x103d17f7, - 0x103d980b, - 0x103e182a, - 0x103e9839, - 0x103f1850, - 0x103f9863, + 0x103d180a, + 0x103d981e, + 0x103e183d, + 0x103e984c, + 0x103f1863, + 0x103f9876, 0x10400ca3, - 0x10409876, - 0x10411894, - 0x104198a7, - 0x104218c1, - 0x104298d1, - 0x104318e5, - 0x104398fb, - 0x10441913, - 0x10449928, - 0x1045193c, - 0x1045994e, + 0x10409889, + 0x104118a7, + 0x104198ba, + 0x104218d4, + 0x104298e4, + 0x104318f8, + 0x1043990e, + 0x10441926, + 0x1044993b, + 0x1045194f, + 0x10459961, 0x10460635, 0x1046899a, - 0x10471963, - 0x1047997a, - 0x1048198f, - 0x1048999d, + 0x10471976, + 0x1047998d, + 0x104819a2, + 0x104899b0, 0x10490edf, - 0x104997d7, - 0x104a16a1, + 0x104997ea, + 0x104a16b4, 0x14320c73, 0x14328c94, 0x14330ca3, @@ -139,53 +139,54 @@ const uint32_t kOpenSSLReasonValues[] = { 0x183480f7, 0x18351032, 0x1835904a, - 0x1836105f, - 0x18369073, - 0x183710ab, - 0x183790c1, - 0x183810d5, - 0x183890e5, + 0x18361072, + 0x18369086, + 0x183710be, + 0x183790d4, + 0x183810e8, + 0x183890f8, 0x18390ac0, - 0x183990f5, - 0x183a111b, - 0x183a9141, + 0x18399108, + 0x183a112e, + 0x183a9154, 0x183b0ceb, - 0x183b9190, - 0x183c11a2, - 0x183c91ad, - 0x183d11bd, - 0x183d91ce, - 0x183e11df, - 0x183e91f1, - 0x183f121a, - 0x183f9233, - 0x1840124b, + 0x183b91a3, + 0x183c11b5, + 0x183c91c0, + 0x183d11d0, + 0x183d91e1, + 0x183e11f2, + 0x183e9204, + 0x183f122d, + 0x183f9246, + 0x1840125e, 0x1840870d, - 0x18411164, - 0x1841912f, - 0x1842114e, + 0x18411177, + 0x18419142, + 0x18421161, 0x18428c81, - 0x1843110a, - 0x18439176, + 0x1843111d, + 0x18439189, 0x18441028, - 0x18449097, - 0x20321285, - 0x20329272, - 0x24321291, + 0x184490aa, + 0x1845105f, + 
0x20321298, + 0x20329285, + 0x243212a4, 0x243289e0, - 0x243312a3, - 0x243392b0, - 0x243412bd, - 0x243492cf, - 0x243512de, - 0x243592fb, - 0x24361308, - 0x24369316, - 0x24371324, - 0x24379332, - 0x2438133b, - 0x24389348, - 0x2439135b, + 0x243312b6, + 0x243392c3, + 0x243412d0, + 0x243492e2, + 0x243512f1, + 0x2435930e, + 0x2436131b, + 0x24369329, + 0x24371337, + 0x24379345, + 0x2438134e, + 0x2438935b, + 0x2439136e, 0x28320cd3, 0x28328ceb, 0x28330ca3, @@ -195,51 +196,51 @@ const uint32_t kOpenSSLReasonValues[] = { 0x283500f7, 0x28358c81, 0x2836099a, - 0x2c3232e7, - 0x2c329372, - 0x2c3332f5, - 0x2c33b307, - 0x2c34331b, - 0x2c34b32d, - 0x2c353348, - 0x2c35b35a, - 0x2c36338a, + 0x2c3232fa, + 0x2c329385, + 0x2c333308, + 0x2c33b31a, + 0x2c34332e, + 0x2c34b340, + 0x2c35335b, + 0x2c35b36d, + 0x2c36339d, 0x2c36833a, - 0x2c373397, - 0x2c37b3c3, - 0x2c383401, - 0x2c38b418, - 0x2c393436, - 0x2c39b446, - 0x2c3a3458, - 0x2c3ab46c, - 0x2c3b347d, - 0x2c3bb49c, - 0x2c3c1384, - 0x2c3c939a, - 0x2c3d34e1, - 0x2c3d93b3, - 0x2c3e350b, - 0x2c3eb519, - 0x2c3f3531, - 0x2c3fb549, - 0x2c403573, - 0x2c409285, - 0x2c413584, - 0x2c41b597, - 0x2c42124b, - 0x2c42b5a8, + 0x2c3733aa, + 0x2c37b3d6, + 0x2c383414, + 0x2c38b42b, + 0x2c393449, + 0x2c39b459, + 0x2c3a346b, + 0x2c3ab47f, + 0x2c3b3490, + 0x2c3bb4af, + 0x2c3c1397, + 0x2c3c93ad, + 0x2c3d34f4, + 0x2c3d93c6, + 0x2c3e351e, + 0x2c3eb52c, + 0x2c3f3544, + 0x2c3fb55c, + 0x2c403586, + 0x2c409298, + 0x2c413597, + 0x2c41b5aa, + 0x2c42125e, + 0x2c42b5bb, 0x2c43076d, - 0x2c43b48e, - 0x2c4433d6, - 0x2c44b556, - 0x2c45336d, - 0x2c45b3a9, - 0x2c463426, - 0x2c46b4b0, - 0x2c4734c5, - 0x2c47b4fe, - 0x2c4833e8, + 0x2c43b4a1, + 0x2c4433e9, + 0x2c44b569, + 0x2c453380, + 0x2c45b3bc, + 0x2c463439, + 0x2c46b4c3, + 0x2c4734d8, + 0x2c47b511, + 0x2c4833fb, 0x30320000, 0x30328015, 0x3033001f, @@ -379,261 +380,261 @@ const uint32_t kOpenSSLReasonValues[] = { 0x3c418dd3, 0x3c420edf, 0x3c428e69, - 0x40321a2f, - 0x40329a45, - 0x40331a73, - 0x40339a7d, - 0x40341a94, - 0x40349ab2, - 0x40351ac2, - 0x40359ad4, - 0x40361ae1, - 0x40369aed, - 0x40371b02, - 0x40379b14, - 0x40381b1f, - 0x40389b31, + 0x40321a42, + 0x40329a58, + 0x40331a86, + 0x40339a90, + 0x40341aa7, + 0x40349ac5, + 0x40351ad5, + 0x40359ae7, + 0x40361af4, + 0x40369b00, + 0x40371b15, + 0x40379b27, + 0x40381b32, + 0x40389b44, 0x40390f93, - 0x40399b41, - 0x403a1b54, - 0x403a9b75, - 0x403b1b86, - 0x403b9b96, + 0x40399b54, + 0x403a1b67, + 0x403a9b88, + 0x403b1b99, + 0x403b9ba9, 0x403c0071, 0x403c8090, - 0x403d1bf7, - 0x403d9c0d, - 0x403e1c1c, - 0x403e9c54, - 0x403f1c6e, - 0x403f9c96, - 0x40401cab, - 0x40409cbf, - 0x40411cfa, - 0x40419d15, - 0x40421d2e, - 0x40429d41, - 0x40431d55, - 0x40439d83, - 0x40441d9a, + 0x403d1c0a, + 0x403d9c20, + 0x403e1c2f, + 0x403e9c67, + 0x403f1c81, + 0x403f9ca9, + 0x40401cbe, + 0x40409cd2, + 0x40411d0d, + 0x40419d28, + 0x40421d41, + 0x40429d54, + 0x40431d68, + 0x40439d96, + 0x40441dad, 0x404480b9, - 0x40451daf, - 0x40459dc1, - 0x40461de5, - 0x40469e05, - 0x40471e13, - 0x40479e3a, - 0x40481eab, - 0x40489f65, - 0x40491f7c, - 0x40499f96, - 0x404a1fad, - 0x404a9fcb, - 0x404b1fe3, - 0x404ba010, - 0x404c2026, - 0x404ca038, - 0x404d2059, - 0x404da092, - 0x404e20a6, - 0x404ea0b3, - 0x404f2164, - 0x404fa1da, - 0x40502249, - 0x4050a25d, - 0x40512290, - 0x405222a0, - 0x4052a2c4, - 0x405322dc, - 0x4053a2ef, - 0x40542304, - 0x4054a327, - 0x40552352, - 0x4055a38f, - 0x405623b4, - 0x4056a3cd, - 0x405723e5, - 0x4057a3f8, - 0x4058240d, - 0x4058a434, - 0x40592463, - 0x4059a490, - 0x405aa4a4, - 0x405b24bc, - 0x405ba4cd, - 0x405c24e0, - 
0x405ca51f, - 0x405d252c, - 0x405da551, - 0x405e258f, + 0x40451dc2, + 0x40459dd4, + 0x40461df8, + 0x40469e18, + 0x40471e26, + 0x40479e4d, + 0x40481ebe, + 0x40489f78, + 0x40491f8f, + 0x40499fa9, + 0x404a1fc0, + 0x404a9fde, + 0x404b1ff6, + 0x404ba023, + 0x404c2039, + 0x404ca04b, + 0x404d206c, + 0x404da0a5, + 0x404e20b9, + 0x404ea0c6, + 0x404f2177, + 0x404fa1ed, + 0x4050225c, + 0x4050a270, + 0x405122a3, + 0x405222b3, + 0x4052a2d7, + 0x405322ef, + 0x4053a302, + 0x40542317, + 0x4054a33a, + 0x40552365, + 0x4055a3a2, + 0x405623c7, + 0x4056a3e0, + 0x405723f8, + 0x4057a40b, + 0x40582420, + 0x4058a447, + 0x40592476, + 0x4059a4a3, + 0x405aa4b7, + 0x405b24cf, + 0x405ba4e0, + 0x405c24f3, + 0x405ca532, + 0x405d253f, + 0x405da564, + 0x405e25a2, 0x405e8afe, - 0x405f25b0, - 0x405fa5bd, - 0x406025cb, - 0x4060a5ed, - 0x4061264e, - 0x4061a686, - 0x4062269d, - 0x4062a6ae, - 0x406326fb, - 0x4063a710, - 0x40642727, - 0x4064a753, - 0x4065276e, - 0x4065a785, - 0x4066279d, - 0x4066a7c7, - 0x406727f2, - 0x4067a837, - 0x4068287f, - 0x4068a8a0, - 0x406928d2, - 0x4069a900, - 0x406a2921, - 0x406aa941, - 0x406b2ac9, - 0x406baaec, - 0x406c2b02, - 0x406cae0c, - 0x406d2e3b, - 0x406dae63, - 0x406e2e91, - 0x406eaede, - 0x406f2f37, - 0x406faf6f, - 0x40702f82, - 0x4070af9f, + 0x405f25c3, + 0x405fa5d0, + 0x406025de, + 0x4060a600, + 0x40612661, + 0x4061a699, + 0x406226b0, + 0x4062a6c1, + 0x4063270e, + 0x4063a723, + 0x4064273a, + 0x4064a766, + 0x40652781, + 0x4065a798, + 0x406627b0, + 0x4066a7da, + 0x40672805, + 0x4067a84a, + 0x40682892, + 0x4068a8b3, + 0x406928e5, + 0x4069a913, + 0x406a2934, + 0x406aa954, + 0x406b2adc, + 0x406baaff, + 0x406c2b15, + 0x406cae1f, + 0x406d2e4e, + 0x406dae76, + 0x406e2ea4, + 0x406eaef1, + 0x406f2f4a, + 0x406faf82, + 0x40702f95, + 0x4070afb2, 0x4071084d, - 0x4071afb1, - 0x40722fc4, - 0x4072affa, - 0x40733012, - 0x4073959c, - 0x40743026, - 0x4074b040, - 0x40753051, - 0x4075b065, - 0x40763073, - 0x40769348, - 0x40773098, - 0x4077b0d8, - 0x407830f3, - 0x4078b12c, - 0x40793143, - 0x4079b159, - 0x407a3185, - 0x407ab198, - 0x407b31ad, - 0x407bb1bf, - 0x407c31f0, - 0x407cb1f9, - 0x407d28bb, - 0x407da202, - 0x407e3108, - 0x407ea444, - 0x407f1e27, - 0x407f9ffa, - 0x40802174, - 0x40809e4f, - 0x408122b2, - 0x4081a101, - 0x40822e7c, - 0x40829ba2, - 0x4083241f, - 0x4083a738, - 0x40841e63, - 0x4084a47c, - 0x408524f1, - 0x4085a615, - 0x40862571, - 0x4086a21c, - 0x40872ec2, - 0x4087a663, - 0x40881be0, - 0x4088a84a, - 0x40891c2f, - 0x40899bbc, - 0x408a2b3a, - 0x408a99b4, - 0x408b31d4, - 0x408baf4c, - 0x408c2501, - 0x408c99ec, - 0x408d1f4b, - 0x408d9e95, - 0x408e207b, - 0x408ea36f, - 0x408f285e, - 0x408fa631, - 0x40902813, - 0x4090a543, - 0x40912b22, - 0x40919a12, - 0x40921c7c, - 0x4092aefd, - 0x40932fdd, - 0x4093a22d, - 0x40941e77, - 0x4094ab53, - 0x409526bf, - 0x4095b165, - 0x40962ea9, - 0x4096a18d, - 0x40972278, - 0x4097a0ca, - 0x40981cdc, - 0x4098a6d3, - 0x40992f19, - 0x4099a39c, - 0x409a2335, - 0x409a99d0, - 0x409b1ed1, - 0x409b9efc, - 0x409c30ba, - 0x409c9f24, - 0x409d2149, - 0x409da117, - 0x409e1d6d, - 0x409ea1c2, - 0x409f21aa, - 0x409f9ec4, - 0x40a021ea, - 0x40a0a0e4, - 0x40a12132, - 0x41f429f4, - 0x41f92a86, - 0x41fe2979, - 0x41feac2f, - 0x41ff2d5d, - 0x42032a0d, - 0x42082a2f, - 0x4208aa6b, - 0x4209295d, - 0x4209aaa5, - 0x420a29b4, - 0x420aa994, - 0x420b29d4, - 0x420baa4d, - 0x420c2d79, - 0x420cab63, - 0x420d2c16, - 0x420dac4d, - 0x42122c80, - 0x42172d40, - 0x4217acc2, - 0x421c2ce4, - 0x421f2c9f, - 0x42212df1, - 0x42262d23, - 0x422b2dcf, - 0x422babf1, - 0x422c2db1, - 0x422caba4, - 0x422d2b7d, - 0x422dad90, - 
0x422e2bd0, - 0x42302cff, - 0x4230ac67, + 0x4071afc4, + 0x40722fd7, + 0x4072b00d, + 0x40733025, + 0x407395af, + 0x40743039, + 0x4074b053, + 0x40753064, + 0x4075b078, + 0x40763086, + 0x4076935b, + 0x407730ab, + 0x4077b0eb, + 0x40783106, + 0x4078b13f, + 0x40793156, + 0x4079b16c, + 0x407a3198, + 0x407ab1ab, + 0x407b31c0, + 0x407bb1d2, + 0x407c3203, + 0x407cb20c, + 0x407d28ce, + 0x407da215, + 0x407e311b, + 0x407ea457, + 0x407f1e3a, + 0x407fa00d, + 0x40802187, + 0x40809e62, + 0x408122c5, + 0x4081a114, + 0x40822e8f, + 0x40829bb5, + 0x40832432, + 0x4083a74b, + 0x40841e76, + 0x4084a48f, + 0x40852504, + 0x4085a628, + 0x40862584, + 0x4086a22f, + 0x40872ed5, + 0x4087a676, + 0x40881bf3, + 0x4088a85d, + 0x40891c42, + 0x40899bcf, + 0x408a2b4d, + 0x408a99c7, + 0x408b31e7, + 0x408baf5f, + 0x408c2514, + 0x408c99ff, + 0x408d1f5e, + 0x408d9ea8, + 0x408e208e, + 0x408ea382, + 0x408f2871, + 0x408fa644, + 0x40902826, + 0x4090a556, + 0x40912b35, + 0x40919a25, + 0x40921c8f, + 0x4092af10, + 0x40932ff0, + 0x4093a240, + 0x40941e8a, + 0x4094ab66, + 0x409526d2, + 0x4095b178, + 0x40962ebc, + 0x4096a1a0, + 0x4097228b, + 0x4097a0dd, + 0x40981cef, + 0x4098a6e6, + 0x40992f2c, + 0x4099a3af, + 0x409a2348, + 0x409a99e3, + 0x409b1ee4, + 0x409b9f0f, + 0x409c30cd, + 0x409c9f37, + 0x409d215c, + 0x409da12a, + 0x409e1d80, + 0x409ea1d5, + 0x409f21bd, + 0x409f9ed7, + 0x40a021fd, + 0x40a0a0f7, + 0x40a12145, + 0x41f42a07, + 0x41f92a99, + 0x41fe298c, + 0x41feac42, + 0x41ff2d70, + 0x42032a20, + 0x42082a42, + 0x4208aa7e, + 0x42092970, + 0x4209aab8, + 0x420a29c7, + 0x420aa9a7, + 0x420b29e7, + 0x420baa60, + 0x420c2d8c, + 0x420cab76, + 0x420d2c29, + 0x420dac60, + 0x42122c93, + 0x42172d53, + 0x4217acd5, + 0x421c2cf7, + 0x421f2cb2, + 0x42212e04, + 0x42262d36, + 0x422b2de2, + 0x422bac04, + 0x422c2dc4, + 0x422cabb7, + 0x422d2b90, + 0x422dada3, + 0x422e2be3, + 0x42302d12, + 0x4230ac7a, 0x44320778, 0x44328787, 0x44330793, @@ -651,109 +652,109 @@ const uint32_t kOpenSSLReasonValues[] = { 0x4439084d, 0x4439885b, 0x443a086e, - 0x48321372, - 0x48329384, - 0x4833139a, - 0x483393b3, - 0x4c3213f0, - 0x4c329400, - 0x4c331413, - 0x4c339433, + 0x48321385, + 0x48329397, + 0x483313ad, + 0x483393c6, + 0x4c321403, + 0x4c329413, + 0x4c331426, + 0x4c339446, 0x4c3400b9, 0x4c3480f7, - 0x4c35143f, - 0x4c35944d, - 0x4c361469, - 0x4c36948f, - 0x4c37149e, - 0x4c3794ac, - 0x4c3814c1, - 0x4c3894cd, - 0x4c3914ed, - 0x4c399517, - 0x4c3a1530, - 0x4c3a9549, + 0x4c351452, + 0x4c359460, + 0x4c36147c, + 0x4c3694a2, + 0x4c3714b1, + 0x4c3794bf, + 0x4c3814d4, + 0x4c3894e0, + 0x4c391500, + 0x4c39952a, + 0x4c3a1543, + 0x4c3a955c, 0x4c3b0635, - 0x4c3b9562, - 0x4c3c1574, - 0x4c3c9583, - 0x4c3d159c, + 0x4c3b9575, + 0x4c3c1587, + 0x4c3c9596, + 0x4c3d15af, 0x4c3d8cc6, - 0x4c3e1609, - 0x4c3e95ab, - 0x4c3f162b, - 0x4c3f9348, - 0x4c4015c1, - 0x4c4093dc, - 0x4c4115f9, - 0x4c41947c, - 0x4c4215e5, - 0x4c4293c4, - 0x503235ba, - 0x5032b5c9, - 0x503335d4, - 0x5033b5e4, - 0x503435fd, - 0x5034b617, - 0x50353625, - 0x5035b63b, - 0x5036364d, - 0x5036b663, - 0x5037367c, - 0x5037b68f, - 0x503836a7, - 0x5038b6b8, - 0x503936cd, - 0x5039b6e1, - 0x503a3701, - 0x503ab717, - 0x503b372f, - 0x503bb741, - 0x503c375d, - 0x503cb774, - 0x503d378d, - 0x503db7a3, - 0x503e37b0, - 0x503eb7c6, - 0x503f37d8, + 0x4c3e161c, + 0x4c3e95be, + 0x4c3f163e, + 0x4c3f935b, + 0x4c4015d4, + 0x4c4093ef, + 0x4c41160c, + 0x4c41948f, + 0x4c4215f8, + 0x4c4293d7, + 0x503235cd, + 0x5032b5dc, + 0x503335e7, + 0x5033b5f7, + 0x50343610, + 0x5034b62a, + 0x50353638, + 0x5035b64e, + 0x50363660, + 0x5036b676, + 0x5037368f, + 0x5037b6a2, + 
0x503836ba, + 0x5038b6cb, + 0x503936e0, + 0x5039b6f4, + 0x503a3714, + 0x503ab72a, + 0x503b3742, + 0x503bb754, + 0x503c3770, + 0x503cb787, + 0x503d37a0, + 0x503db7b6, + 0x503e37c3, + 0x503eb7d9, + 0x503f37eb, 0x503f83b3, - 0x504037eb, - 0x5040b7fb, - 0x50413815, - 0x5041b824, - 0x5042383e, - 0x5042b85b, - 0x5043386b, - 0x5043b87b, - 0x50443898, + 0x504037fe, + 0x5040b80e, + 0x50413828, + 0x5041b837, + 0x50423851, + 0x5042b86e, + 0x5043387e, + 0x5043b88e, + 0x504438ab, 0x50448469, - 0x504538ac, - 0x5045b8ca, - 0x504638dd, - 0x5046b8f3, - 0x50473905, - 0x5047b91a, - 0x50483940, - 0x5048b94e, - 0x50493961, - 0x5049b976, - 0x504a398c, - 0x504ab99c, - 0x504b39bc, - 0x504bb9cf, - 0x504c39f2, - 0x504cba20, - 0x504d3a4d, - 0x504dba6a, - 0x504e3a85, - 0x504ebaa1, - 0x504f3ab3, - 0x504fbaca, - 0x50503ad9, + 0x504538bf, + 0x5045b8dd, + 0x504638f0, + 0x5046b906, + 0x50473918, + 0x5047b92d, + 0x50483953, + 0x5048b961, + 0x50493974, + 0x5049b989, + 0x504a399f, + 0x504ab9af, + 0x504b39cf, + 0x504bb9e2, + 0x504c3a05, + 0x504cba33, + 0x504d3a60, + 0x504dba7d, + 0x504e3a98, + 0x504ebab4, + 0x504f3ac6, + 0x504fbadd, + 0x50503aec, 0x50508729, - 0x50513aec, - 0x5051b88a, - 0x50523a32, + 0x50513aff, + 0x5051b89d, + 0x50523a45, 0x58320fd1, 0x68320f93, 0x68328ceb, @@ -795,22 +796,22 @@ const uint32_t kOpenSSLReasonValues[] = { 0x783d8b97, 0x783e0aed, 0x783e8a9f, - 0x7c321261, - 0x8032148f, + 0x7c321274, + 0x803214a2, 0x80328090, - 0x803332b6, + 0x803332c9, 0x803380b9, - 0x803432c5, - 0x8034b22d, - 0x8035324b, - 0x8035b2d9, - 0x8036328d, - 0x8036b23c, - 0x8037327f, - 0x8037b21a, - 0x803832a0, - 0x8038b25c, - 0x80393271, + 0x803432d8, + 0x8034b240, + 0x8035325e, + 0x8035b2ec, + 0x803632a0, + 0x8036b24f, + 0x80373292, + 0x8037b22d, + 0x803832b3, + 0x8038b26f, + 0x80393284, }; const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]); @@ -1034,6 +1035,7 @@ const char kOpenSSLReasonStringData[] = "EMPTY_PSK\0" "EXPECTING_AN_EC_KEY_KEY\0" "EXPECTING_AN_RSA_KEY\0" + "EXPECTING_A_DH_KEY\0" "EXPECTING_A_DSA_KEY\0" "ILLEGAL_OR_UNSUPPORTED_PADDING_MODE\0" "INVALID_BUFFER_SIZE\0" diff --git a/Sources/CCryptoBoringSSL/gen/crypto/md5-586-apple.S b/Sources/CCryptoBoringSSL/gen/crypto/md5-586-apple.S new file mode 100644 index 00000000..ec0c8472 --- /dev/null +++ b/Sources/CCryptoBoringSSL/gen/crypto/md5-586-apple.S @@ -0,0 +1,689 @@ +#define BORINGSSL_PREFIX CCryptoBoringSSL +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _md5_block_asm_data_order +.private_extern _md5_block_asm_data_order +.align 4 +_md5_block_asm_data_order: +L_md5_block_asm_data_order_begin: + pushl %esi + pushl %edi + movl 12(%esp),%edi + movl 16(%esp),%esi + movl 20(%esp),%ecx + pushl %ebp + shll $6,%ecx + pushl %ebx + addl %esi,%ecx + subl $64,%ecx + movl (%edi),%eax + pushl %ecx + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx +L000start: + + # R0 section + movl %ecx,%edi + movl (%esi),%ebp + # R0 0 + xorl %edx,%edi + andl %ebx,%edi + leal 3614090360(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 4(%esi),%ebp + addl %ebx,%eax + # R0 1 + xorl %ecx,%edi + andl %eax,%edi + leal 3905402710(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 8(%esi),%ebp + addl %eax,%edx + # R0 2 + xorl %ebx,%edi + andl %edx,%edi + leal 606105819(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 12(%esi),%ebp + addl %edx,%ecx + # R0 3 + xorl %eax,%edi + andl %ecx,%edi + leal 3250441966(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 16(%esi),%ebp + addl %ecx,%ebx + # R0 4 + xorl %edx,%edi + andl %ebx,%edi + leal 4118548399(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 20(%esi),%ebp + addl %ebx,%eax + # R0 5 + xorl %ecx,%edi + andl %eax,%edi + leal 1200080426(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 24(%esi),%ebp + addl %eax,%edx + # R0 6 + xorl %ebx,%edi + andl %edx,%edi + leal 2821735955(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 28(%esi),%ebp + addl %edx,%ecx + # R0 7 + xorl %eax,%edi + andl %ecx,%edi + leal 4249261313(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 32(%esi),%ebp + addl %ecx,%ebx + # R0 8 + xorl %edx,%edi + andl %ebx,%edi + leal 1770035416(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 36(%esi),%ebp + addl %ebx,%eax + # R0 9 + xorl %ecx,%edi + andl %eax,%edi + leal 2336552879(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 40(%esi),%ebp + addl %eax,%edx + # R0 10 + xorl %ebx,%edi + andl %edx,%edi + leal 4294925233(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 44(%esi),%ebp + addl %edx,%ecx + # R0 11 + xorl %eax,%edi + andl %ecx,%edi + leal 2304563134(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 48(%esi),%ebp + addl %ecx,%ebx + # R0 12 + xorl %edx,%edi + andl %ebx,%edi + leal 1804603682(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 52(%esi),%ebp + addl %ebx,%eax + # R0 13 + xorl %ecx,%edi + andl %eax,%edi + leal 4254626195(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 56(%esi),%ebp + addl %eax,%edx + # R0 14 + xorl %ebx,%edi + andl %edx,%edi + leal 2792965006(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 60(%esi),%ebp + addl %edx,%ecx + # R0 15 + xorl %eax,%edi + andl %ecx,%edi + leal 1236535329(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 4(%esi),%ebp + addl %ecx,%ebx + + # R1 section + # R1 16 + leal 
4129170786(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 24(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 17 + leal 3225465664(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 44(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 18 + leal 643717713(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl (%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 19 + leal 3921069994(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 20(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 20 + leal 3593408605(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 40(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 21 + leal 38016083(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 60(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 22 + leal 3634488961(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 16(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 23 + leal 3889429448(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 36(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 24 + leal 568446438(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 56(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 25 + leal 3275163606(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 12(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 26 + leal 4107603335(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 32(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 27 + leal 1163531501(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 52(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 28 + leal 2850285829(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 8(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 29 + leal 4243563512(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 28(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 30 + leal 1735328473(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 48(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 31 + leal 2368359562(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 20(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + + # R2 section + # R2 32 + xorl %edx,%edi + xorl %ebx,%edi + leal 4294588738(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 32(%esi),%ebp + movl %ebx,%edi + # R2 33 + leal 2272392833(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 44(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 34 + xorl %ebx,%edi + xorl %edx,%edi + leal 1839030562(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 56(%esi),%ebp + movl %edx,%edi + # R2 35 + leal 4259657740(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 4(%esi),%ebp + addl 
%edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 36 + xorl %edx,%edi + xorl %ebx,%edi + leal 2763975236(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 16(%esi),%ebp + movl %ebx,%edi + # R2 37 + leal 1272893353(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 28(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 38 + xorl %ebx,%edi + xorl %edx,%edi + leal 4139469664(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 40(%esi),%ebp + movl %edx,%edi + # R2 39 + leal 3200236656(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 52(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 40 + xorl %edx,%edi + xorl %ebx,%edi + leal 681279174(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl (%esi),%ebp + movl %ebx,%edi + # R2 41 + leal 3936430074(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 12(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 42 + xorl %ebx,%edi + xorl %edx,%edi + leal 3572445317(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 24(%esi),%ebp + movl %edx,%edi + # R2 43 + leal 76029189(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 36(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 44 + xorl %edx,%edi + xorl %ebx,%edi + leal 3654602809(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 48(%esi),%ebp + movl %ebx,%edi + # R2 45 + leal 3873151461(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 60(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 46 + xorl %ebx,%edi + xorl %edx,%edi + leal 530742520(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 8(%esi),%ebp + movl %edx,%edi + # R2 47 + leal 3299628645(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl (%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $23,%ebx + addl %ecx,%ebx + + # R3 section + # R3 48 + xorl %edx,%edi + orl %ebx,%edi + leal 4096336452(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 28(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 49 + orl %eax,%edi + leal 1126891415(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 56(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 50 + orl %edx,%edi + leal 2878612391(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 20(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 51 + orl %ecx,%edi + leal 4237533241(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 48(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + # R3 52 + orl %ebx,%edi + leal 1700485571(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 12(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 53 + orl %eax,%edi + leal 2399980690(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 40(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 54 + orl %edx,%edi + leal 4293915773(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 4(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 55 + orl %ecx,%edi + leal 2240044497(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 32(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl 
%ecx,%ebx + # R3 56 + orl %ebx,%edi + leal 1873313359(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 60(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 57 + orl %eax,%edi + leal 4264355552(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 24(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 58 + orl %edx,%edi + leal 2734768916(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 52(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 59 + orl %ecx,%edi + leal 1309151649(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 16(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + # R3 60 + orl %ebx,%edi + leal 4149444226(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 44(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 61 + orl %eax,%edi + leal 3174756917(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 8(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 62 + orl %edx,%edi + leal 718787259(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 36(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 63 + orl %ecx,%edi + leal 3951481745(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 24(%esp),%ebp + addl %edi,%ebx + addl $64,%esi + roll $21,%ebx + movl (%ebp),%edi + addl %ecx,%ebx + addl %edi,%eax + movl 4(%ebp),%edi + addl %edi,%ebx + movl 8(%ebp),%edi + addl %edi,%ecx + movl 12(%ebp),%edi + addl %edi,%edx + movl %eax,(%ebp) + movl %ebx,4(%ebp) + movl (%esp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + cmpl %esi,%edi + jae L000start + popl %eax + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-linux.linux.x86.S b/Sources/CCryptoBoringSSL/gen/crypto/md5-586-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-linux.linux.x86.S rename to Sources/CCryptoBoringSSL/gen/crypto/md5-586-linux.S index 6de63f48..ed1ac320 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-586-linux.linux.x86.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/md5-586-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__i386__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -686,7 +685,6 @@ md5_block_asm_data_order: ret .size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin #endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) -#endif // defined(__i386__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-mac.mac.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-apple.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-mac.mac.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-apple.S index 44871ebd..e5e052dd 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-mac.mac.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-apple.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__APPLE__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. @@ -690,7 +689,6 @@ L$epilogue: #endif -#endif // defined(__x86_64__) && defined(__APPLE__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-linux.linux.x86_64.S b/Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-linux.S similarity index 99% rename from Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-linux.linux.x86_64.S rename to Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-linux.S index f5556018..ffb7fb13 100644 --- a/Sources/CCryptoBoringSSL/crypto/fipsmodule/md5-x86_64-linux.linux.x86_64.S +++ b/Sources/CCryptoBoringSSL/gen/crypto/md5-x86_64-linux.S @@ -1,5 +1,4 @@ #define BORINGSSL_PREFIX CCryptoBoringSSL -#if defined(__x86_64__) && defined(__linux__) // This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. 
@@ -695,7 +694,6 @@ _CET_ENDBR .cfi_endproc .size md5_block_asm_data_order,.-md5_block_asm_data_order #endif -#endif // defined(__x86_64__) && defined(__linux__) #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/Sources/CCryptoBoringSSL/hash.txt b/Sources/CCryptoBoringSSL/hash.txt index 74b3edbd..64113321 100644 --- a/Sources/CCryptoBoringSSL/hash.txt +++ b/Sources/CCryptoBoringSSL/hash.txt @@ -1 +1 @@ -This directory is derived from BoringSSL cloned from https://boringssl.googlesource.com/boringssl at revision dbad745811195c00b729efd0ee0a09b7d9fce1d2 +This directory is derived from BoringSSL cloned from https://boringssl.googlesource.com/boringssl at revision 6a2ccdcc2ed1d37a43a2183658d2ae61fd5ce208 diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL.h index ac689525..60637c3c 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL.h @@ -14,6 +14,7 @@ #ifndef C_CRYPTO_BORINGSSL_H #define C_CRYPTO_BORINGSSL_H +#include "CCryptoBoringSSL_aead.h" #include "CCryptoBoringSSL_aes.h" #include "CCryptoBoringSSL_arm_arch.h" #include "CCryptoBoringSSL_asn1_mac.h" @@ -22,6 +23,7 @@ #include "CCryptoBoringSSL_bio.h" #include "CCryptoBoringSSL_blake2.h" #include "CCryptoBoringSSL_blowfish.h" +#include "CCryptoBoringSSL_bn.h" #include "CCryptoBoringSSL_boringssl_prefix_symbols.h" #include "CCryptoBoringSSL_boringssl_prefix_symbols_asm.h" #include "CCryptoBoringSSL_cast.h" diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_asn1.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_asn1.h index dbf5b5b8..09ae0156 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_asn1.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_asn1.h @@ -468,7 +468,8 @@ DECLARE_ASN1_ITEM(ASN1_FBOOLEAN) // An asn1_string_st (aka |ASN1_STRING|) represents a value of a string-like // ASN.1 type. It contains a |type| field, and a byte string |data| field with a -// type-specific representation. +// type-specific representation. This type-specific representation does not +// always correspond to the DER encoding of the type. // // If |type| is one of |V_ASN1_OCTET_STRING|, |V_ASN1_UTF8STRING|, // |V_ASN1_NUMERICSTRING|, |V_ASN1_PRINTABLESTRING|, |V_ASN1_T61STRING|, @@ -568,6 +569,10 @@ OPENSSL_EXPORT int ASN1_STRING_type(const ASN1_STRING *str); // ASN1_STRING_get0_data returns a pointer to |str|'s contents. Callers should // use |ASN1_STRING_length| to determine the length of the string. The string // may have embedded NUL bytes and may not be NUL-terminated. +// +// The contents of an |ASN1_STRING| encode the value in some type-specific +// representation that does not always correspond to the DER encoding of the +// type. See the documentation for |ASN1_STRING| for details. OPENSSL_EXPORT const unsigned char *ASN1_STRING_get0_data( const ASN1_STRING *str); @@ -575,10 +580,18 @@ OPENSSL_EXPORT const unsigned char *ASN1_STRING_get0_data( // should use |ASN1_STRING_length| to determine the length of the string. The // string may have embedded NUL bytes and may not be NUL-terminated. // +// The contents of an |ASN1_STRING| encode the value in some type-specific +// representation that does not always correspond to the DER encoding of the +// type. See the documentation for |ASN1_STRING| for details. +// // Prefer |ASN1_STRING_get0_data|. 
OPENSSL_EXPORT unsigned char *ASN1_STRING_data(ASN1_STRING *str); // ASN1_STRING_length returns the length of |str|, in bytes. +// +// The contents of an |ASN1_STRING| encode the value in some type-specific +// representation that does not always correspond to the DER encoding of the +// type. See the documentation for |ASN1_STRING| for details. OPENSSL_EXPORT int ASN1_STRING_length(const ASN1_STRING *str); // ASN1_STRING_cmp compares |a| and |b|'s type and contents. It returns an diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_base.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_base.h index bf897fd5..1a02695d 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_base.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_base.h @@ -189,6 +189,13 @@ extern "C" { #define OPENSSL_PRINTF_FORMAT_FUNC(string_index, first_to_check) #endif +// OPENSSL_CLANG_PRAGMA emits a pragma on clang and nothing on other compilers. +#if defined(__clang__) +#define OPENSSL_CLANG_PRAGMA(arg) _Pragma(arg) +#else +#define OPENSSL_CLANG_PRAGMA(arg) +#endif + // OPENSSL_MSVC_PRAGMA emits a pragma on MSVC and nothing on other compilers. #if defined(_MSC_VER) #define OPENSSL_MSVC_PRAGMA(arg) __pragma(arg) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bcm_public.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bcm_public.h new file mode 100644 index 00000000..89c45765 --- /dev/null +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bcm_public.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2024, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_BCM_PUBLIC_H_ +#define OPENSSL_HEADER_BCM_PUBLIC_H_ + +#include "CCryptoBoringSSL_base.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +// Public types referenced by BoringCrypto +// +// This header contains public types referenced by BCM. Such types are difficult +// to hide from the libcrypto interface, so we treat them as part of BCM. + +// BCM_SHA_CBLOCK is the block size of SHA-1. +#define BCM_SHA_CBLOCK 64 + +// SHA_CTX +struct sha_state_st { +#if defined(__cplusplus) || defined(OPENSSL_WINDOWS) + uint32_t h[5]; +#else + // wpa_supplicant accesses |h0|..|h4| so we must support those names for + // compatibility with it until it can be updated. Anonymous unions are only + // standard in C11, so disable this workaround in C++. 
+ union { + uint32_t h[5]; + struct { + uint32_t h0; + uint32_t h1; + uint32_t h2; + uint32_t h3; + uint32_t h4; + }; + }; +#endif + uint32_t Nl, Nh; + uint8_t data[BCM_SHA_CBLOCK]; + unsigned num; +}; + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_BCM_PUBLIC_H_ diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bio.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bio.h index 839bf205..0e6bab06 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bio.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bio.h @@ -714,33 +714,35 @@ OPENSSL_EXPORT void BIO_meth_free(BIO_METHOD *method); // and returns one. The function should return one on success and zero on // error. OPENSSL_EXPORT int BIO_meth_set_create(BIO_METHOD *method, - int (*create)(BIO *)); + int (*create_func)(BIO *)); // BIO_meth_set_destroy sets a function to release data associated with a |BIO| // and returns one. The function's return value is ignored. OPENSSL_EXPORT int BIO_meth_set_destroy(BIO_METHOD *method, - int (*destroy)(BIO *)); + int (*destroy_func)(BIO *)); // BIO_meth_set_write sets the implementation of |BIO_write| for |method| and // returns one. |BIO_METHOD|s which implement |BIO_write| should also implement // |BIO_CTRL_FLUSH|. (See |BIO_meth_set_ctrl|.) OPENSSL_EXPORT int BIO_meth_set_write(BIO_METHOD *method, - int (*write)(BIO *, const char *, int)); + int (*write_func)(BIO *, const char *, + int)); // BIO_meth_set_read sets the implementation of |BIO_read| for |method| and // returns one. OPENSSL_EXPORT int BIO_meth_set_read(BIO_METHOD *method, - int (*read)(BIO *, char *, int)); + int (*read_func)(BIO *, char *, int)); // BIO_meth_set_gets sets the implementation of |BIO_gets| for |method| and // returns one. OPENSSL_EXPORT int BIO_meth_set_gets(BIO_METHOD *method, - int (*gets)(BIO *, char *, int)); + int (*gets_func)(BIO *, char *, int)); // BIO_meth_set_ctrl sets the implementation of |BIO_ctrl| for |method| and // returns one. OPENSSL_EXPORT int BIO_meth_set_ctrl(BIO_METHOD *method, - long (*ctrl)(BIO *, int, long, void *)); + long (*ctrl_func)(BIO *, int, long, + void *)); // BIO_set_data sets custom data on |bio|. It may be retried with // |BIO_get_data|. diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bn.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bn.h index a93aaa5e..013bc10e 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bn.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bn.h @@ -388,9 +388,9 @@ OPENSSL_EXPORT void BN_CTX_end(BN_CTX *ctx); // or |b|. It returns one on success and zero on allocation failure. OPENSSL_EXPORT int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); -// BN_uadd sets |r| = |a| + |b|, where |a| and |b| are non-negative and |r| may -// be the same pointer as either |a| or |b|. It returns one on success and zero -// on allocation failure. +// BN_uadd sets |r| = |a| + |b|, considering only the absolute values of |a| and +// |b|. |r| may be the same pointer as either |a| or |b|. It returns one on +// success and zero on allocation failure. OPENSSL_EXPORT int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); // BN_add_word adds |w| to |a|. It returns one on success and zero otherwise. @@ -400,9 +400,9 @@ OPENSSL_EXPORT int BN_add_word(BIGNUM *a, BN_ULONG w); // or |b|. It returns one on success and zero on allocation failure. 
OPENSSL_EXPORT int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); -// BN_usub sets |r| = |a| - |b|, where |a| and |b| are non-negative integers, -// |b| < |a| and |r| may be the same pointer as either |a| or |b|. It returns -// one on success and zero on allocation failure. +// BN_usub sets |r| = |a| - |b|, considering only the absolute values of |a| and +// |b|. The result must be non-negative, i.e. |b| <= |a|. |r| may be the same +// pointer as either |a| or |b|. It returns one on success and zero on error. OPENSSL_EXPORT int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); // BN_sub_word subtracts |w| from |a|. It returns one on success and zero on @@ -425,9 +425,14 @@ OPENSSL_EXPORT int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx); // BN_div divides |numerator| by |divisor| and places the result in |quotient| // and the remainder in |rem|. Either of |quotient| or |rem| may be NULL, in -// which case the respective value is not returned. The result is rounded -// towards zero; thus if |numerator| is negative, the remainder will be zero or -// negative. It returns one on success or zero on error. +// which case the respective value is not returned. It returns one on success or +// zero on error. It is an error condition if |divisor| is zero. +// +// The outputs will be such that |quotient| * |divisor| + |rem| = |numerator|, +// with the quotient rounded towards zero. Thus, if |numerator| is negative, +// |rem| will be zero or negative. If |divisor| is negative, the sign of +// |quotient| will be flipped to compensate but otherwise rounding will be as if +// |divisor| were its absolute value. OPENSSL_EXPORT int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator, const BIGNUM *divisor, BN_CTX *ctx); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols.h index 5cb6e422..7ec8370c 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols.h @@ -210,6 +210,14 @@ #define BASIC_CONSTRAINTS_free BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_free) #define BASIC_CONSTRAINTS_it BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_it) #define BASIC_CONSTRAINTS_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_new) +#define BCM_fips_186_2_prf BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_fips_186_2_prf) +#define BCM_rand_bytes BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_rand_bytes) +#define BCM_rand_bytes_hwrng BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_rand_bytes_hwrng) +#define BCM_rand_bytes_with_additional_data BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_rand_bytes_with_additional_data) +#define BCM_sha1_final BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_sha1_final) +#define BCM_sha1_init BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_sha1_init) +#define BCM_sha1_transform BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_sha1_transform) +#define BCM_sha1_update BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BCM_sha1_update) #define BIO_append_filename BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BIO_append_filename) #define BIO_callback_ctrl BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BIO_callback_ctrl) #define BIO_clear_flags BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, BIO_clear_flags) @@ -716,6 +724,16 @@ #define DH_size BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DH_size) #define DH_up_ref BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DH_up_ref) #define DHparams_dup 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DHparams_dup) +#define DILITHIUM_generate_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_generate_key) +#define DILITHIUM_generate_key_external_entropy BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_generate_key_external_entropy) +#define DILITHIUM_marshal_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_marshal_private_key) +#define DILITHIUM_marshal_public_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_marshal_public_key) +#define DILITHIUM_parse_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_parse_private_key) +#define DILITHIUM_parse_public_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_parse_public_key) +#define DILITHIUM_public_from_private BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_public_from_private) +#define DILITHIUM_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_sign) +#define DILITHIUM_sign_deterministic BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_sign_deterministic) +#define DILITHIUM_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DILITHIUM_verify) #define DIRECTORYSTRING_free BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DIRECTORYSTRING_free) #define DIRECTORYSTRING_it BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DIRECTORYSTRING_it) #define DIRECTORYSTRING_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, DIRECTORYSTRING_new) @@ -1097,6 +1115,7 @@ #define EVP_PKEY_CTX_set0_rsa_oaep_label BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set0_rsa_oaep_label) #define EVP_PKEY_CTX_set1_hkdf_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set1_hkdf_key) #define EVP_PKEY_CTX_set1_hkdf_salt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set1_hkdf_salt) +#define EVP_PKEY_CTX_set_dh_pad BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dh_pad) #define EVP_PKEY_CTX_set_dsa_paramgen_bits BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dsa_paramgen_bits) #define EVP_PKEY_CTX_set_dsa_paramgen_q_bits BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dsa_paramgen_q_bits) #define EVP_PKEY_CTX_set_ec_param_enc BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_ec_param_enc) @@ -1113,6 +1132,7 @@ #define EVP_PKEY_CTX_set_rsa_pss_saltlen BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_rsa_pss_saltlen) #define EVP_PKEY_CTX_set_signature_md BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_signature_md) #define EVP_PKEY_assign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign) +#define EVP_PKEY_assign_DH BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign_DH) #define EVP_PKEY_assign_DSA BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign_DSA) #define EVP_PKEY_assign_EC_KEY BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign_EC_KEY) #define EVP_PKEY_assign_RSA BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_assign_RSA) @@ -1154,6 +1174,7 @@ #define EVP_PKEY_print_params BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_print_params) #define EVP_PKEY_print_private BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_print_private) #define EVP_PKEY_print_public BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_print_public) +#define EVP_PKEY_set1_DH BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_set1_DH) #define EVP_PKEY_set1_DSA BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_set1_DSA) #define EVP_PKEY_set1_EC_KEY BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_set1_EC_KEY) #define EVP_PKEY_set1_RSA BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_PKEY_set1_RSA) @@ -1238,6 +1259,7 @@ #define EVP_hpke_aes_256_gcm BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, 
EVP_hpke_aes_256_gcm) #define EVP_hpke_chacha20_poly1305 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_hpke_chacha20_poly1305) #define EVP_hpke_hkdf_sha256 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_hpke_hkdf_sha256) +#define EVP_hpke_p256_hkdf_sha256 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_hpke_p256_hkdf_sha256) #define EVP_hpke_x25519_hkdf_sha256 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_hpke_x25519_hkdf_sha256) #define EVP_marshal_digest_algorithm BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_marshal_digest_algorithm) #define EVP_marshal_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, EVP_marshal_private_key) @@ -1340,6 +1362,18 @@ #define MD5_Update BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MD5_Update) #define METHOD_ref BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, METHOD_ref) #define METHOD_unref BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, METHOD_unref) +#define MLDSA65_generate_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_generate_key) +#define MLDSA65_generate_key_external_entropy BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_generate_key_external_entropy) +#define MLDSA65_marshal_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_marshal_private_key) +#define MLDSA65_marshal_public_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_marshal_public_key) +#define MLDSA65_parse_private_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_parse_private_key) +#define MLDSA65_parse_public_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_parse_public_key) +#define MLDSA65_private_key_from_seed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_private_key_from_seed) +#define MLDSA65_public_from_private BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_public_from_private) +#define MLDSA65_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_sign) +#define MLDSA65_sign_internal BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_sign_internal) +#define MLDSA65_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_verify) +#define MLDSA65_verify_internal BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, MLDSA65_verify_internal) #define NAME_CONSTRAINTS_check BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NAME_CONSTRAINTS_check) #define NAME_CONSTRAINTS_free BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NAME_CONSTRAINTS_free) #define NAME_CONSTRAINTS_it BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NAME_CONSTRAINTS_it) @@ -1365,6 +1399,8 @@ #define NOTICEREF_free BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NOTICEREF_free) #define NOTICEREF_it BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NOTICEREF_it) #define NOTICEREF_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, NOTICEREF_new) +#define OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER) +#define OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER) #define OBJ_cbs2nid BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJ_cbs2nid) #define OBJ_cleanup BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJ_cleanup) #define OBJ_cmp BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OBJ_cmp) @@ -1404,6 +1440,7 @@ #define OPENSSL_gmtime_diff BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_gmtime_diff) #define OPENSSL_hash32 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_hash32) #define OPENSSL_ia32cap_P BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_ia32cap_P) +#define OPENSSL_init_cpuid BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_init_cpuid) #define 
OPENSSL_init_crypto BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_init_crypto) #define OPENSSL_isalnum BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_isalnum) #define OPENSSL_isalpha BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, OPENSSL_isalpha) @@ -1613,7 +1650,6 @@ #define RAND_SSLeay BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_SSLeay) #define RAND_add BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_add) #define RAND_bytes BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_bytes) -#define RAND_bytes_with_additional_data BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_bytes_with_additional_data) #define RAND_cleanup BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_cleanup) #define RAND_disable_fork_unsafe_buffering BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_disable_fork_unsafe_buffering) #define RAND_egd BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, RAND_egd) @@ -1737,6 +1773,10 @@ #define SPAKE2_CTX_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPAKE2_CTX_new) #define SPAKE2_generate_msg BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPAKE2_generate_msg) #define SPAKE2_process_msg BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPAKE2_process_msg) +#define SPX_generate_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPX_generate_key) +#define SPX_generate_key_from_seed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPX_generate_key_from_seed) +#define SPX_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPX_sign) +#define SPX_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SPX_verify) #define SSLeay BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SSLeay) #define SSLeay_version BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, SSLeay_version) #define TRUST_TOKEN_CLIENT_add_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, TRUST_TOKEN_CLIENT_add_key) @@ -2065,11 +2105,9 @@ #define X509_STORE_load_locations BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_load_locations) #define X509_STORE_new BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_new) #define X509_STORE_set1_param BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set1_param) -#define X509_STORE_set_check_crl BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_check_crl) #define X509_STORE_set_default_paths BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_default_paths) #define X509_STORE_set_depth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_depth) #define X509_STORE_set_flags BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_flags) -#define X509_STORE_set_get_crl BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_get_crl) #define X509_STORE_set_purpose BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_purpose) #define X509_STORE_set_trust BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_trust) #define X509_STORE_set_verify_cb BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, X509_STORE_set_verify_cb) @@ -2249,8 +2287,11 @@ #define aes_hw_decrypt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_decrypt) #define aes_hw_ecb_encrypt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_ecb_encrypt) #define aes_hw_encrypt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_encrypt) +#define aes_hw_encrypt_key_to_decrypt_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_encrypt_key_to_decrypt_key) #define aes_hw_set_decrypt_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_set_decrypt_key) #define aes_hw_set_encrypt_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_set_encrypt_key) +#define aes_hw_set_encrypt_key_alt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_set_encrypt_key_alt) +#define aes_hw_set_encrypt_key_base BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_hw_set_encrypt_key_base) #define aes_nohw_cbc_encrypt 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_nohw_cbc_encrypt) #define aes_nohw_ctr32_encrypt_blocks BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_nohw_ctr32_encrypt_blocks) #define aes_nohw_decrypt BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, aes_nohw_decrypt) @@ -2322,19 +2363,22 @@ #define bn_mont_ctx_set_RR_consttime BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mont_ctx_set_RR_consttime) #define bn_mont_n0 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mont_n0) #define bn_mul4x_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul4x_mont) +#define bn_mul4x_mont_gather5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul4x_mont_gather5) #define bn_mul_add_words BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_add_words) #define bn_mul_comba4 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_comba4) #define bn_mul_comba8 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_comba8) #define bn_mul_consttime BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_consttime) #define bn_mul_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_mont) -#define bn_mul_mont_gather5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_mont_gather5) +#define bn_mul_mont_gather5_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_mont_gather5_nohw) #define bn_mul_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_mont_nohw) #define bn_mul_small BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_small) #define bn_mul_words BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mul_words) #define bn_mulx4x_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mulx4x_mont) +#define bn_mulx4x_mont_gather5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_mulx4x_mont_gather5) #define bn_odd_number_is_obviously_composite BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_odd_number_is_obviously_composite) #define bn_one_to_montgomery BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_one_to_montgomery) -#define bn_power5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_power5) +#define bn_power5_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_power5_nohw) +#define bn_powerx5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_powerx5) #define bn_rand_range_words BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_rand_range_words) #define bn_rand_secret_range BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_rand_secret_range) #define bn_reduce_once BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, bn_reduce_once) @@ -2369,7 +2413,11 @@ #define c2i_ASN1_INTEGER BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, c2i_ASN1_INTEGER) #define c2i_ASN1_OBJECT BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, c2i_ASN1_OBJECT) #define chacha20_poly1305_open BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_open) +#define chacha20_poly1305_open_avx2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_open_avx2) +#define chacha20_poly1305_open_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_open_nohw) #define chacha20_poly1305_seal BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_seal) +#define chacha20_poly1305_seal_avx2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_seal_avx2) +#define chacha20_poly1305_seal_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, chacha20_poly1305_seal_nohw) #define crypto_gcm_clmul_enabled BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, crypto_gcm_clmul_enabled) #define d2i_ASN1_BIT_STRING BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_ASN1_BIT_STRING) #define d2i_ASN1_BMPSTRING BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_ASN1_BMPSTRING) @@ -2478,8 +2526,10 @@ #define d2i_X509_VAL BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_X509_VAL) #define d2i_X509_bio BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_X509_bio) #define d2i_X509_fp 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, d2i_X509_fp) +#define dh_asn1_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dh_asn1_meth) #define dh_check_params_fast BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dh_check_params_fast) #define dh_compute_key_padded_no_self_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dh_compute_key_padded_no_self_test) +#define dh_pkey_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dh_pkey_meth) #define dsa_asn1_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dsa_asn1_meth) #define dsa_check_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, dsa_check_key) #define ec_GFp_mont_add BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ec_GFp_mont_add) @@ -2569,25 +2619,46 @@ #define ec_set_to_safe_point BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ec_set_to_safe_point) #define ec_simple_scalar_inv0_montgomery BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ec_simple_scalar_inv0_montgomery) #define ec_simple_scalar_to_montgomery_inv_vartime BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ec_simple_scalar_to_montgomery_inv_vartime) -#define ecdsa_do_verify_no_self_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_do_verify_no_self_test) -#define ecdsa_sign_with_nonce_for_known_answer_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_sign_with_nonce_for_known_answer_test) -#define ecp_nistz256_avx2_select_w7 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_avx2_select_w7) +#define ecdsa_sign_fixed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_sign_fixed) +#define ecdsa_sign_fixed_with_nonce_for_known_answer_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_sign_fixed_with_nonce_for_known_answer_test) +#define ecdsa_verify_fixed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_verify_fixed) +#define ecdsa_verify_fixed_no_self_test BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecdsa_verify_fixed_no_self_test) #define ecp_nistz256_div_by_2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_div_by_2) #define ecp_nistz256_mul_by_2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_by_2) #define ecp_nistz256_mul_by_3 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_by_3) #define ecp_nistz256_mul_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_mont) +#define ecp_nistz256_mul_mont_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_mont_adx) +#define ecp_nistz256_mul_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_mul_mont_nohw) #define ecp_nistz256_neg BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_neg) #define ecp_nistz256_ord_mul_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont) +#define ecp_nistz256_ord_mul_mont_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont_adx) +#define ecp_nistz256_ord_mul_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont_nohw) #define ecp_nistz256_ord_sqr_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont) +#define ecp_nistz256_ord_sqr_mont_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont_adx) +#define ecp_nistz256_ord_sqr_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont_nohw) #define ecp_nistz256_point_add BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add) +#define ecp_nistz256_point_add_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_adx) #define ecp_nistz256_point_add_affine BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine) +#define ecp_nistz256_point_add_affine_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine_adx) +#define ecp_nistz256_point_add_affine_nohw 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine_nohw) +#define ecp_nistz256_point_add_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_add_nohw) #define ecp_nistz256_point_double BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_double) +#define ecp_nistz256_point_double_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_double_adx) +#define ecp_nistz256_point_double_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_point_double_nohw) #define ecp_nistz256_select_w5 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w5) +#define ecp_nistz256_select_w5_avx2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w5_avx2) +#define ecp_nistz256_select_w5_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w5_nohw) #define ecp_nistz256_select_w7 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w7) +#define ecp_nistz256_select_w7_avx2 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w7_avx2) +#define ecp_nistz256_select_w7_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_select_w7_nohw) #define ecp_nistz256_sqr_mont BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont) +#define ecp_nistz256_sqr_mont_adx BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont_adx) +#define ecp_nistz256_sqr_mont_nohw BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont_nohw) #define ecp_nistz256_sub BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ecp_nistz256_sub) #define ed25519_asn1_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ed25519_asn1_meth) #define ed25519_pkey_meth BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, ed25519_pkey_meth) +#define evp_md_md5_sha1 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, evp_md_md5_sha1) +#define evp_pkey_set_method BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, evp_pkey_set_method) #define fiat_curve25519_adx_mul BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, fiat_curve25519_adx_mul) #define fiat_curve25519_adx_square BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, fiat_curve25519_adx_square) #define fiat_p256_adx_mul BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, fiat_p256_adx_mul) @@ -2829,8 +2900,6 @@ #define spx_fors_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_fors_sign) #define spx_fors_sk_gen BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_fors_sk_gen) #define spx_fors_treehash BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_fors_treehash) -#define spx_generate_key BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_generate_key) -#define spx_generate_key_from_seed BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_generate_key_from_seed) #define spx_get_tree_index BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_get_tree_index) #define spx_ht_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_ht_sign) #define spx_ht_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_ht_verify) @@ -2842,7 +2911,6 @@ #define spx_set_tree_height BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_set_tree_height) #define spx_set_tree_index BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_set_tree_index) #define spx_set_type BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_set_type) -#define spx_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_sign) #define spx_thash_f BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_thash_f) #define spx_thash_h BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_thash_h) #define spx_thash_hmsg BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_thash_hmsg) @@ -2853,12 +2921,12 @@ #define spx_to_uint64 BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_to_uint64) #define spx_treehash BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_treehash) #define spx_uint64_to_len_bytes 
BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_uint64_to_len_bytes) -#define spx_verify BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_verify) #define spx_wots_pk_from_sig BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_wots_pk_from_sig) #define spx_wots_pk_gen BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_wots_pk_gen) #define spx_wots_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_wots_sign) #define spx_xmss_pk_from_sig BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_xmss_pk_from_sig) #define spx_xmss_sign BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, spx_xmss_sign) +#define swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE) #define v2i_GENERAL_NAME BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, v2i_GENERAL_NAME) #define v2i_GENERAL_NAMES BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, v2i_GENERAL_NAMES) #define v2i_GENERAL_NAME_ex BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, v2i_GENERAL_NAME_ex) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols_asm.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols_asm.h index 3f18f9d2..2d0fad13 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols_asm.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_boringssl_prefix_symbols_asm.h @@ -215,6 +215,14 @@ #define _BASIC_CONSTRAINTS_free BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_free) #define _BASIC_CONSTRAINTS_it BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_it) #define _BASIC_CONSTRAINTS_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BASIC_CONSTRAINTS_new) +#define _BCM_fips_186_2_prf BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_fips_186_2_prf) +#define _BCM_rand_bytes BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_rand_bytes) +#define _BCM_rand_bytes_hwrng BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_rand_bytes_hwrng) +#define _BCM_rand_bytes_with_additional_data BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_rand_bytes_with_additional_data) +#define _BCM_sha1_final BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_sha1_final) +#define _BCM_sha1_init BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_sha1_init) +#define _BCM_sha1_transform BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_sha1_transform) +#define _BCM_sha1_update BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BCM_sha1_update) #define _BIO_append_filename BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BIO_append_filename) #define _BIO_callback_ctrl BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BIO_callback_ctrl) #define _BIO_clear_flags BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, BIO_clear_flags) @@ -721,6 +729,16 @@ #define _DH_size BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DH_size) #define _DH_up_ref BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DH_up_ref) #define _DHparams_dup BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DHparams_dup) +#define _DILITHIUM_generate_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_generate_key) +#define _DILITHIUM_generate_key_external_entropy BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_generate_key_external_entropy) +#define _DILITHIUM_marshal_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_marshal_private_key) +#define _DILITHIUM_marshal_public_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_marshal_public_key) +#define _DILITHIUM_parse_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, 
DILITHIUM_parse_private_key) +#define _DILITHIUM_parse_public_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_parse_public_key) +#define _DILITHIUM_public_from_private BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_public_from_private) +#define _DILITHIUM_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_sign) +#define _DILITHIUM_sign_deterministic BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_sign_deterministic) +#define _DILITHIUM_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DILITHIUM_verify) #define _DIRECTORYSTRING_free BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DIRECTORYSTRING_free) #define _DIRECTORYSTRING_it BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DIRECTORYSTRING_it) #define _DIRECTORYSTRING_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, DIRECTORYSTRING_new) @@ -1102,6 +1120,7 @@ #define _EVP_PKEY_CTX_set0_rsa_oaep_label BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set0_rsa_oaep_label) #define _EVP_PKEY_CTX_set1_hkdf_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set1_hkdf_key) #define _EVP_PKEY_CTX_set1_hkdf_salt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set1_hkdf_salt) +#define _EVP_PKEY_CTX_set_dh_pad BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dh_pad) #define _EVP_PKEY_CTX_set_dsa_paramgen_bits BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dsa_paramgen_bits) #define _EVP_PKEY_CTX_set_dsa_paramgen_q_bits BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_dsa_paramgen_q_bits) #define _EVP_PKEY_CTX_set_ec_param_enc BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_ec_param_enc) @@ -1118,6 +1137,7 @@ #define _EVP_PKEY_CTX_set_rsa_pss_saltlen BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_rsa_pss_saltlen) #define _EVP_PKEY_CTX_set_signature_md BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_CTX_set_signature_md) #define _EVP_PKEY_assign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign) +#define _EVP_PKEY_assign_DH BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign_DH) #define _EVP_PKEY_assign_DSA BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign_DSA) #define _EVP_PKEY_assign_EC_KEY BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign_EC_KEY) #define _EVP_PKEY_assign_RSA BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_assign_RSA) @@ -1159,6 +1179,7 @@ #define _EVP_PKEY_print_params BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_print_params) #define _EVP_PKEY_print_private BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_print_private) #define _EVP_PKEY_print_public BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_print_public) +#define _EVP_PKEY_set1_DH BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_set1_DH) #define _EVP_PKEY_set1_DSA BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_set1_DSA) #define _EVP_PKEY_set1_EC_KEY BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_set1_EC_KEY) #define _EVP_PKEY_set1_RSA BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_PKEY_set1_RSA) @@ -1243,6 +1264,7 @@ #define _EVP_hpke_aes_256_gcm BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_aes_256_gcm) #define _EVP_hpke_chacha20_poly1305 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_chacha20_poly1305) #define _EVP_hpke_hkdf_sha256 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_hkdf_sha256) +#define _EVP_hpke_p256_hkdf_sha256 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_p256_hkdf_sha256) #define _EVP_hpke_x25519_hkdf_sha256 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_hpke_x25519_hkdf_sha256) #define _EVP_marshal_digest_algorithm BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_marshal_digest_algorithm) #define _EVP_marshal_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, EVP_marshal_private_key) @@ -1345,6 +1367,18 @@ #define _MD5_Update BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MD5_Update) #define _METHOD_ref BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, METHOD_ref) #define _METHOD_unref BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, METHOD_unref) +#define _MLDSA65_generate_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_generate_key) +#define _MLDSA65_generate_key_external_entropy BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_generate_key_external_entropy) +#define _MLDSA65_marshal_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_marshal_private_key) +#define _MLDSA65_marshal_public_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_marshal_public_key) +#define _MLDSA65_parse_private_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_parse_private_key) +#define _MLDSA65_parse_public_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_parse_public_key) +#define _MLDSA65_private_key_from_seed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_private_key_from_seed) +#define _MLDSA65_public_from_private BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_public_from_private) +#define _MLDSA65_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_sign) +#define _MLDSA65_sign_internal BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_sign_internal) +#define _MLDSA65_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_verify) +#define _MLDSA65_verify_internal BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, MLDSA65_verify_internal) #define _NAME_CONSTRAINTS_check BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NAME_CONSTRAINTS_check) #define _NAME_CONSTRAINTS_free BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NAME_CONSTRAINTS_free) #define _NAME_CONSTRAINTS_it BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NAME_CONSTRAINTS_it) @@ -1370,6 +1404,8 @@ #define _NOTICEREF_free BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NOTICEREF_free) #define _NOTICEREF_it BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NOTICEREF_it) #define _NOTICEREF_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, NOTICEREF_new) +#define _OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER) +#define _OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER) #define _OBJ_cbs2nid BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJ_cbs2nid) #define _OBJ_cleanup BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJ_cleanup) #define _OBJ_cmp BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OBJ_cmp) @@ -1409,6 +1445,7 @@ #define _OPENSSL_gmtime_diff BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_gmtime_diff) #define _OPENSSL_hash32 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_hash32) #define _OPENSSL_ia32cap_P BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_ia32cap_P) +#define _OPENSSL_init_cpuid 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_init_cpuid) #define _OPENSSL_init_crypto BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_init_crypto) #define _OPENSSL_isalnum BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_isalnum) #define _OPENSSL_isalpha BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, OPENSSL_isalpha) @@ -1618,7 +1655,6 @@ #define _RAND_SSLeay BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_SSLeay) #define _RAND_add BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_add) #define _RAND_bytes BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_bytes) -#define _RAND_bytes_with_additional_data BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_bytes_with_additional_data) #define _RAND_cleanup BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_cleanup) #define _RAND_disable_fork_unsafe_buffering BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_disable_fork_unsafe_buffering) #define _RAND_egd BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, RAND_egd) @@ -1742,6 +1778,10 @@ #define _SPAKE2_CTX_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPAKE2_CTX_new) #define _SPAKE2_generate_msg BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPAKE2_generate_msg) #define _SPAKE2_process_msg BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPAKE2_process_msg) +#define _SPX_generate_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPX_generate_key) +#define _SPX_generate_key_from_seed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPX_generate_key_from_seed) +#define _SPX_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPX_sign) +#define _SPX_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SPX_verify) #define _SSLeay BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SSLeay) #define _SSLeay_version BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, SSLeay_version) #define _TRUST_TOKEN_CLIENT_add_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, TRUST_TOKEN_CLIENT_add_key) @@ -2070,11 +2110,9 @@ #define _X509_STORE_load_locations BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_load_locations) #define _X509_STORE_new BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_new) #define _X509_STORE_set1_param BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set1_param) -#define _X509_STORE_set_check_crl BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_check_crl) #define _X509_STORE_set_default_paths BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_default_paths) #define _X509_STORE_set_depth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_depth) #define _X509_STORE_set_flags BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_flags) -#define _X509_STORE_set_get_crl BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_get_crl) #define _X509_STORE_set_purpose BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_purpose) #define _X509_STORE_set_trust BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_trust) #define _X509_STORE_set_verify_cb BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, X509_STORE_set_verify_cb) @@ -2254,8 +2292,11 @@ #define _aes_hw_decrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_decrypt) #define _aes_hw_ecb_encrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_ecb_encrypt) #define _aes_hw_encrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_encrypt) +#define _aes_hw_encrypt_key_to_decrypt_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_encrypt_key_to_decrypt_key) #define _aes_hw_set_decrypt_key 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_set_decrypt_key) #define _aes_hw_set_encrypt_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_set_encrypt_key) +#define _aes_hw_set_encrypt_key_alt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_set_encrypt_key_alt) +#define _aes_hw_set_encrypt_key_base BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_hw_set_encrypt_key_base) #define _aes_nohw_cbc_encrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_nohw_cbc_encrypt) #define _aes_nohw_ctr32_encrypt_blocks BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_nohw_ctr32_encrypt_blocks) #define _aes_nohw_decrypt BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, aes_nohw_decrypt) @@ -2327,19 +2368,22 @@ #define _bn_mont_ctx_set_RR_consttime BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mont_ctx_set_RR_consttime) #define _bn_mont_n0 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mont_n0) #define _bn_mul4x_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul4x_mont) +#define _bn_mul4x_mont_gather5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul4x_mont_gather5) #define _bn_mul_add_words BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_add_words) #define _bn_mul_comba4 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_comba4) #define _bn_mul_comba8 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_comba8) #define _bn_mul_consttime BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_consttime) #define _bn_mul_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_mont) -#define _bn_mul_mont_gather5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_mont_gather5) +#define _bn_mul_mont_gather5_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_mont_gather5_nohw) #define _bn_mul_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_mont_nohw) #define _bn_mul_small BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_small) #define _bn_mul_words BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mul_words) #define _bn_mulx4x_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mulx4x_mont) +#define _bn_mulx4x_mont_gather5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_mulx4x_mont_gather5) #define _bn_odd_number_is_obviously_composite BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_odd_number_is_obviously_composite) #define _bn_one_to_montgomery BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_one_to_montgomery) -#define _bn_power5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_power5) +#define _bn_power5_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_power5_nohw) +#define _bn_powerx5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_powerx5) #define _bn_rand_range_words BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_rand_range_words) #define _bn_rand_secret_range BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_rand_secret_range) #define _bn_reduce_once BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, bn_reduce_once) @@ -2374,7 +2418,11 @@ #define _c2i_ASN1_INTEGER BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, c2i_ASN1_INTEGER) #define _c2i_ASN1_OBJECT BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, c2i_ASN1_OBJECT) #define _chacha20_poly1305_open BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_open) +#define _chacha20_poly1305_open_avx2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_open_avx2) +#define _chacha20_poly1305_open_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_open_nohw) #define _chacha20_poly1305_seal 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_seal) +#define _chacha20_poly1305_seal_avx2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_seal_avx2) +#define _chacha20_poly1305_seal_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, chacha20_poly1305_seal_nohw) #define _crypto_gcm_clmul_enabled BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, crypto_gcm_clmul_enabled) #define _d2i_ASN1_BIT_STRING BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_ASN1_BIT_STRING) #define _d2i_ASN1_BMPSTRING BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_ASN1_BMPSTRING) @@ -2483,8 +2531,10 @@ #define _d2i_X509_VAL BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_X509_VAL) #define _d2i_X509_bio BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_X509_bio) #define _d2i_X509_fp BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, d2i_X509_fp) +#define _dh_asn1_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dh_asn1_meth) #define _dh_check_params_fast BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dh_check_params_fast) #define _dh_compute_key_padded_no_self_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dh_compute_key_padded_no_self_test) +#define _dh_pkey_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dh_pkey_meth) #define _dsa_asn1_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dsa_asn1_meth) #define _dsa_check_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, dsa_check_key) #define _ec_GFp_mont_add BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ec_GFp_mont_add) @@ -2574,25 +2624,46 @@ #define _ec_set_to_safe_point BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ec_set_to_safe_point) #define _ec_simple_scalar_inv0_montgomery BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ec_simple_scalar_inv0_montgomery) #define _ec_simple_scalar_to_montgomery_inv_vartime BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ec_simple_scalar_to_montgomery_inv_vartime) -#define _ecdsa_do_verify_no_self_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_do_verify_no_self_test) -#define _ecdsa_sign_with_nonce_for_known_answer_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_sign_with_nonce_for_known_answer_test) -#define _ecp_nistz256_avx2_select_w7 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_avx2_select_w7) +#define _ecdsa_sign_fixed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_sign_fixed) +#define _ecdsa_sign_fixed_with_nonce_for_known_answer_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_sign_fixed_with_nonce_for_known_answer_test) +#define _ecdsa_verify_fixed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_verify_fixed) +#define _ecdsa_verify_fixed_no_self_test BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecdsa_verify_fixed_no_self_test) #define _ecp_nistz256_div_by_2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_div_by_2) #define _ecp_nistz256_mul_by_2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_by_2) #define _ecp_nistz256_mul_by_3 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_by_3) #define _ecp_nistz256_mul_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_mont) +#define _ecp_nistz256_mul_mont_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_mont_adx) +#define _ecp_nistz256_mul_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_mul_mont_nohw) #define _ecp_nistz256_neg BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_neg) #define _ecp_nistz256_ord_mul_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, 
ecp_nistz256_ord_mul_mont) +#define _ecp_nistz256_ord_mul_mont_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont_adx) +#define _ecp_nistz256_ord_mul_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_mul_mont_nohw) #define _ecp_nistz256_ord_sqr_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont) +#define _ecp_nistz256_ord_sqr_mont_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont_adx) +#define _ecp_nistz256_ord_sqr_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_ord_sqr_mont_nohw) #define _ecp_nistz256_point_add BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add) +#define _ecp_nistz256_point_add_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_adx) #define _ecp_nistz256_point_add_affine BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine) +#define _ecp_nistz256_point_add_affine_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine_adx) +#define _ecp_nistz256_point_add_affine_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_affine_nohw) +#define _ecp_nistz256_point_add_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_add_nohw) #define _ecp_nistz256_point_double BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_double) +#define _ecp_nistz256_point_double_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_double_adx) +#define _ecp_nistz256_point_double_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_point_double_nohw) #define _ecp_nistz256_select_w5 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w5) +#define _ecp_nistz256_select_w5_avx2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w5_avx2) +#define _ecp_nistz256_select_w5_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w5_nohw) #define _ecp_nistz256_select_w7 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w7) +#define _ecp_nistz256_select_w7_avx2 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w7_avx2) +#define _ecp_nistz256_select_w7_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_select_w7_nohw) #define _ecp_nistz256_sqr_mont BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont) +#define _ecp_nistz256_sqr_mont_adx BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont_adx) +#define _ecp_nistz256_sqr_mont_nohw BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_sqr_mont_nohw) #define _ecp_nistz256_sub BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ecp_nistz256_sub) #define _ed25519_asn1_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ed25519_asn1_meth) #define _ed25519_pkey_meth BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, ed25519_pkey_meth) +#define _evp_md_md5_sha1 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, evp_md_md5_sha1) +#define _evp_pkey_set_method BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, evp_pkey_set_method) #define _fiat_curve25519_adx_mul BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, fiat_curve25519_adx_mul) #define _fiat_curve25519_adx_square BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, fiat_curve25519_adx_square) #define _fiat_p256_adx_mul BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, fiat_p256_adx_mul) @@ -2834,8 +2905,6 @@ #define _spx_fors_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_fors_sign) #define _spx_fors_sk_gen 
BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_fors_sk_gen) #define _spx_fors_treehash BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_fors_treehash) -#define _spx_generate_key BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_generate_key) -#define _spx_generate_key_from_seed BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_generate_key_from_seed) #define _spx_get_tree_index BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_get_tree_index) #define _spx_ht_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_ht_sign) #define _spx_ht_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_ht_verify) @@ -2847,7 +2916,6 @@ #define _spx_set_tree_height BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_set_tree_height) #define _spx_set_tree_index BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_set_tree_index) #define _spx_set_type BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_set_type) -#define _spx_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_sign) #define _spx_thash_f BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_thash_f) #define _spx_thash_h BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_thash_h) #define _spx_thash_hmsg BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_thash_hmsg) @@ -2858,12 +2926,12 @@ #define _spx_to_uint64 BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_to_uint64) #define _spx_treehash BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_treehash) #define _spx_uint64_to_len_bytes BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_uint64_to_len_bytes) -#define _spx_verify BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_verify) #define _spx_wots_pk_from_sig BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_wots_pk_from_sig) #define _spx_wots_pk_gen BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_wots_pk_gen) #define _spx_wots_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_wots_sign) #define _spx_xmss_pk_from_sig BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_xmss_pk_from_sig) #define _spx_xmss_sign BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, spx_xmss_sign) +#define _swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE) #define _v2i_GENERAL_NAME BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, v2i_GENERAL_NAME) #define _v2i_GENERAL_NAMES BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, v2i_GENERAL_NAMES) #define _v2i_GENERAL_NAME_ex BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, v2i_GENERAL_NAME_ex) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bytestring.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bytestring.h index 65f9618d..3e8092a9 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bytestring.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_bytestring.h @@ -639,6 +639,9 @@ OPENSSL_EXPORT int CBB_flush_asn1_set_of(CBB *cbb); // Unicode utilities. +// +// These functions consider noncharacters (see section 23.7 from Unicode 15.0.0) +// to be invalid code points and will treat them as an error condition. // The following functions read one Unicode code point from |cbs| with the // corresponding encoding and store it in |*out|. They return one on success and @@ -653,7 +656,9 @@ OPENSSL_EXPORT int CBS_get_utf32_be(CBS *cbs, uint32_t *out); OPENSSL_EXPORT size_t CBB_get_utf8_len(uint32_t u); // The following functions encode |u| to |cbb| with the corresponding -// encoding. They return one on success and zero on error. +// encoding. 
They return one on success and zero on error. Error conditions +// include |u| being an invalid code point, or |u| being unencodable in the +// specified encoding. OPENSSL_EXPORT int CBB_add_utf8(CBB *cbb, uint32_t u); OPENSSL_EXPORT int CBB_add_latin1(CBB *cbb, uint32_t u); OPENSSL_EXPORT int CBB_add_ucs2_be(CBB *cbb, uint32_t u); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_crypto.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_crypto.h index 15755064..f3cb46f0 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_crypto.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_crypto.h @@ -32,18 +32,9 @@ extern "C" { #endif -// crypto.h contains functions for initializing the crypto library. +// crypto.h contains functions for library-wide initialization and properties. -// CRYPTO_library_init initializes the crypto library. It must be called if the -// library is built with BORINGSSL_NO_STATIC_INITIALIZER. Otherwise, it does -// nothing and a static initializer is used instead. It is safe to call this -// function multiple times and concurrently from multiple threads. -// -// On some ARM configurations, this function may require filesystem access and -// should be called before entering a sandbox. -OPENSSL_EXPORT void CRYPTO_library_init(void); - // CRYPTO_is_confidential_build returns one if the linked version of BoringSSL // has been built with the BORINGSSL_CONFIDENTIAL define and zero otherwise. // @@ -164,7 +155,7 @@ OPENSSL_EXPORT void OPENSSL_load_builtin_modules(void); #define OPENSSL_INIT_NO_LOAD_CONFIG 0 #define OPENSSL_INIT_NO_ATEXIT 0 -// OPENSSL_init_crypto calls |CRYPTO_library_init| and returns one. +// OPENSSL_init_crypto returns one. OPENSSL_EXPORT int OPENSSL_init_crypto(uint64_t opts, const OPENSSL_INIT_SETTINGS *settings); @@ -178,6 +169,9 @@ OPENSSL_EXPORT int FIPS_mode_set(int on); // FIPS_module_name returns the name of the FIPS module. OPENSSL_EXPORT const char *FIPS_module_name(void); +// FIPS_module_hash returns the 32-byte hash of the FIPS module. +OPENSSL_EXPORT const uint8_t* FIPS_module_hash(void); + // FIPS_version returns the version of the FIPS module, or zero if the build // isn't exactly at a verified version. The version, expressed in base 10, will // be a date in the form yyyymmddXX where XX is often "00", but can be @@ -196,6 +190,10 @@ OPENSSL_EXPORT int FIPS_query_algorithm_status(const char *algorithm); OPENSSL_EXPORT int CRYPTO_has_broken_NEON(void); #endif +// CRYPTO_library_init does nothing. Historically, it was needed in some build +// configurations to initialization the library. This is no longer necessary. +OPENSSL_EXPORT void CRYPTO_library_init(void); + #if defined(__cplusplus) } // extern C diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dh.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dh.h index 96c1094f..f7399260 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dh.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dh.h @@ -75,6 +75,12 @@ extern "C" { // Allocation and destruction. +// +// A |DH| object represents a Diffie-Hellman key or group parameters. A given +// object may be used concurrently on multiple threads by non-mutating +// functions, provided no other thread is concurrently calling a mutating +// function. Unless otherwise documented, functions which take a |const| pointer +// are non-mutating and functions which take a non-|const| pointer are mutating. // DH_new returns a new, empty DH object or NULL on error. 
OPENSSL_EXPORT DH *DH_new(void); @@ -83,7 +89,8 @@ OPENSSL_EXPORT DH *DH_new(void); // count drops to zero. OPENSSL_EXPORT void DH_free(DH *dh); -// DH_up_ref increments the reference count of |dh| and returns one. +// DH_up_ref increments the reference count of |dh| and returns one. It does not +// mutate |dh| for thread-safety purposes and may be used concurrently. OPENSSL_EXPORT int DH_up_ref(DH *dh); @@ -214,6 +221,9 @@ OPENSSL_EXPORT int DH_generate_key(DH *dh); // Callers that expect a fixed-width secret should use this function over // |DH_compute_key|. Callers that use either function should migrate to a modern // primitive such as X25519 or ECDH with P-256 instead. +// +// This function does not mutate |dh| for thread-safety purposes and may be used +// concurrently. OPENSSL_EXPORT int DH_compute_key_padded(uint8_t *out, const BIGNUM *peers_key, DH *dh); @@ -225,6 +235,9 @@ OPENSSL_EXPORT int DH_compute_key_padded(uint8_t *out, const BIGNUM *peers_key, // // NOTE: this follows the usual BoringSSL return-value convention, but that's // different from |DH_compute_key| and |DH_compute_key_padded|. +// +// This function does not mutate |dh| for thread-safety purposes and may be used +// concurrently. OPENSSL_EXPORT int DH_compute_key_hashed(DH *dh, uint8_t *out, size_t *out_len, size_t max_out_len, const BIGNUM *peers_key, @@ -327,6 +340,9 @@ OPENSSL_EXPORT int i2d_DHparams(const DH *in, unsigned char **outp); // Callers that expect a fixed-width secret should use |DH_compute_key_padded| // instead. Callers that use either function should migrate to a modern // primitive such as X25519 or ECDH with P-256 instead. +// +// This function does not mutate |dh| for thread-safety purposes and may be used +// concurrently. OPENSSL_EXPORT int DH_compute_key(uint8_t *out, const BIGNUM *peers_key, DH *dh); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dsa.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dsa.h index 077aaef1..a16f934b 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dsa.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_dsa.h @@ -78,6 +78,12 @@ extern "C" { // Allocation and destruction. +// +// A |DSA| object represents a DSA key or group parameters. A given object may +// be used concurrently on multiple threads by non-mutating functions, provided +// no other thread is concurrently calling a mutating function. Unless otherwise +// documented, functions which take a |const| pointer are non-mutating and +// functions which take a non-|const| pointer are mutating. // DSA_new returns a new, empty DSA object or NULL on error. OPENSSL_EXPORT DSA *DSA_new(void); @@ -86,7 +92,8 @@ OPENSSL_EXPORT DSA *DSA_new(void); // reference count drops to zero. OPENSSL_EXPORT void DSA_free(DSA *dsa); -// DSA_up_ref increments the reference count of |dsa| and returns one. +// DSA_up_ref increments the reference count of |dsa| and returns one. It does +// not mutate |dsa| for thread-safety purposes and may be used concurrently. OPENSSL_EXPORT int DSA_up_ref(DSA *dsa); @@ -216,7 +223,7 @@ OPENSSL_EXPORT DSA_SIG *DSA_do_sign(const uint8_t *digest, size_t digest_len, // // TODO(fork): deprecate. OPENSSL_EXPORT int DSA_do_verify(const uint8_t *digest, size_t digest_len, - DSA_SIG *sig, const DSA *dsa); + const DSA_SIG *sig, const DSA *dsa); // DSA_do_check_signature sets |*out_valid| to zero. 
Then it verifies that |sig| // is a valid signature, by the public key in |dsa| of the hash in |digest| @@ -225,7 +232,7 @@ OPENSSL_EXPORT int DSA_do_verify(const uint8_t *digest, size_t digest_len, // It returns one if it was able to verify the signature as valid or invalid, // and zero on error. OPENSSL_EXPORT int DSA_do_check_signature(int *out_valid, const uint8_t *digest, - size_t digest_len, DSA_SIG *sig, + size_t digest_len, const DSA_SIG *sig, const DSA *dsa); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp.h index 828bc8d3..fa8c5b54 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp.h @@ -136,10 +136,6 @@ OPENSSL_EXPORT int EVP_PKEY_bits(const EVP_PKEY *pkey); // values. OPENSSL_EXPORT int EVP_PKEY_id(const EVP_PKEY *pkey); -// EVP_PKEY_type returns |nid| if |nid| is a known key type and |NID_undef| -// otherwise. -OPENSSL_EXPORT int EVP_PKEY_type(int nid); - // Getting and setting concrete public key types. // @@ -171,6 +167,11 @@ OPENSSL_EXPORT int EVP_PKEY_assign_EC_KEY(EVP_PKEY *pkey, EC_KEY *key); OPENSSL_EXPORT EC_KEY *EVP_PKEY_get0_EC_KEY(const EVP_PKEY *pkey); OPENSSL_EXPORT EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey); +OPENSSL_EXPORT int EVP_PKEY_set1_DH(EVP_PKEY *pkey, DH *key); +OPENSSL_EXPORT int EVP_PKEY_assign_DH(EVP_PKEY *pkey, DH *key); +OPENSSL_EXPORT DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey); +OPENSSL_EXPORT DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey); + #define EVP_PKEY_NONE NID_undef #define EVP_PKEY_RSA NID_rsaEncryption #define EVP_PKEY_RSA_PSS NID_rsassaPss @@ -179,6 +180,7 @@ OPENSSL_EXPORT EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey); #define EVP_PKEY_ED25519 NID_ED25519 #define EVP_PKEY_X25519 NID_X25519 #define EVP_PKEY_HKDF NID_hkdf +#define EVP_PKEY_DH NID_dhKeyAgreement // EVP_PKEY_set_type sets the type of |pkey| to |type|. It returns one if // successful or zero if the |type| argument is not one of the |EVP_PKEY_*| @@ -814,11 +816,23 @@ OPENSSL_EXPORT int EVP_PKEY_CTX_set_ec_paramgen_curve_nid(EVP_PKEY_CTX *ctx, int nid); -// Deprecated functions. +// Diffie-Hellman-specific control functions. -// EVP_PKEY_DH is defined for compatibility, but it is impossible to create an -// |EVP_PKEY| of that type. -#define EVP_PKEY_DH NID_dhKeyAgreement +// EVP_PKEY_CTX_set_dh_pad configures configures whether |ctx|, which must be an +// |EVP_PKEY_derive| operation, configures the handling of leading zeros in the +// Diffie-Hellman shared secret. If |pad| is zero, leading zeros are removed +// from the secret. If |pad| is non-zero, the fixed-width shared secret is used +// unmodified, as in PKCS #3. If this function is not called, the default is to +// remove leading zeros. +// +// WARNING: The behavior when |pad| is zero leaks information about the shared +// secret. This may result in side channel attacks such as +// https://raccoon-attack.com/, particularly when the same private key is used +// for multiple operations. +OPENSSL_EXPORT int EVP_PKEY_CTX_set_dh_pad(EVP_PKEY_CTX *ctx, int pad); + + +// Deprecated functions. // EVP_PKEY_RSA2 was historically an alternate form for RSA public keys (OID // 2.5.8.1.1), but is no longer accepted. @@ -917,12 +931,6 @@ OPENSSL_EXPORT EVP_PKEY *d2i_AutoPrivateKey(EVP_PKEY **out, const uint8_t **inp, OPENSSL_EXPORT EVP_PKEY *d2i_PublicKey(int type, EVP_PKEY **out, const uint8_t **inp, long len); -// EVP_PKEY_get0_DH returns NULL. 
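To illustrate how the restored DH support in |EVP_PKEY| and the new |EVP_PKEY_CTX_set_dh_pad| control fit together, here is a minimal sketch of a padded shared-secret derivation. It assumes a |DH| key pair |dh| and a peer |EVP_PKEY| created elsewhere, the caller's |out| buffer is at least DH_size(dh) bytes, and the include paths are illustrative; error handling is compressed.

#include <CCryptoBoringSSL_dh.h>
#include <CCryptoBoringSSL_evp.h>

/* Sketch: derive a fixed-width DH shared secret via the EVP_PKEY interface.
 * |dh| is our key pair and |peer| holds the peer's public key; on entry
 * |*out_len| is the size of |out|. Returns 1 on success, 0 on error. */
static int derive_padded_dh_secret(DH *dh, EVP_PKEY *peer,
                                   uint8_t *out, size_t *out_len) {
  int ok = 0;
  EVP_PKEY *pkey = EVP_PKEY_new();
  EVP_PKEY_CTX *ctx = NULL;
  if (pkey == NULL || !EVP_PKEY_set1_DH(pkey, dh)) {
    goto done;
  }
  ctx = EVP_PKEY_CTX_new(pkey, NULL);
  if (ctx == NULL ||
      !EVP_PKEY_derive_init(ctx) ||
      !EVP_PKEY_derive_set_peer(ctx, peer) ||
      /* Keep leading zeros (fixed-width PKCS #3 output), per the warning
       * above about the truncating behavior. */
      !EVP_PKEY_CTX_set_dh_pad(ctx, 1) ||
      !EVP_PKEY_derive(ctx, out, out_len)) {
    goto done;
  }
  ok = 1;

done:
  EVP_PKEY_CTX_free(ctx);
  EVP_PKEY_free(pkey);
  return ok;
}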
-OPENSSL_EXPORT DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey); - -// EVP_PKEY_get1_DH returns NULL. -OPENSSL_EXPORT DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey); - // EVP_PKEY_CTX_set_ec_param_enc returns one if |encoding| is // |OPENSSL_EC_NAMED_CURVE| or zero with an error otherwise. OPENSSL_EXPORT int EVP_PKEY_CTX_set_ec_param_enc(EVP_PKEY_CTX *ctx, @@ -1036,6 +1044,9 @@ OPENSSL_EXPORT int EVP_PKEY_CTX_set_dsa_paramgen_q_bits(EVP_PKEY_CTX *ctx, // Use the |EVP_PKEY_assign_*| functions instead. OPENSSL_EXPORT int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key); +// EVP_PKEY_type returns |nid|. +OPENSSL_EXPORT int EVP_PKEY_type(int nid); + // Preprocessor compatibility section (hidden). // diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp_errors.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp_errors.h index 8583f521..163f17e2 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp_errors.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_evp_errors.h @@ -95,5 +95,6 @@ #define EVP_R_NOT_XOF_OR_INVALID_LENGTH 135 #define EVP_R_EMPTY_PSK 136 #define EVP_R_INVALID_BUFFER_SIZE 137 +#define EVP_R_EXPECTING_A_DH_KEY 138 #endif // OPENSSL_HEADER_EVP_ERRORS_H diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_hpke.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_hpke.h index 32a942f9..caf048c0 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_hpke.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_hpke.h @@ -40,12 +40,14 @@ extern "C" { // respectively. // The following constants are KEM identifiers. +#define EVP_HPKE_DHKEM_P256_HKDF_SHA256 0x0010 #define EVP_HPKE_DHKEM_X25519_HKDF_SHA256 0x0020 // The following functions are KEM algorithms which may be used with HPKE. Note // that, while some HPKE KEMs use KDFs internally, this is separate from the // |EVP_HPKE_KDF| selection. OPENSSL_EXPORT const EVP_HPKE_KEM *EVP_hpke_x25519_hkdf_sha256(void); +OPENSSL_EXPORT const EVP_HPKE_KEM *EVP_hpke_p256_hkdf_sha256(void); // EVP_HPKE_KEM_id returns the HPKE KEM identifier for |kem|, which // will be one of the |EVP_HPKE_KEM_*| constants. @@ -53,7 +55,7 @@ OPENSSL_EXPORT uint16_t EVP_HPKE_KEM_id(const EVP_HPKE_KEM *kem); // EVP_HPKE_MAX_PUBLIC_KEY_LENGTH is the maximum length of an encoded public key // for all KEMs currently supported by this library. -#define EVP_HPKE_MAX_PUBLIC_KEY_LENGTH 32 +#define EVP_HPKE_MAX_PUBLIC_KEY_LENGTH 65 // EVP_HPKE_KEM_public_key_len returns the length of a public key for |kem|. // This value will be at most |EVP_HPKE_MAX_PUBLIC_KEY_LENGTH|. @@ -69,7 +71,7 @@ OPENSSL_EXPORT size_t EVP_HPKE_KEM_private_key_len(const EVP_HPKE_KEM *kem); // EVP_HPKE_MAX_ENC_LENGTH is the maximum length of "enc", the encapsulated // shared secret, for all KEMs currently supported by this library. -#define EVP_HPKE_MAX_ENC_LENGTH 32 +#define EVP_HPKE_MAX_ENC_LENGTH 65 // EVP_HPKE_KEM_enc_len returns the length of the "enc", the encapsulated shared // secret, for |kem|. This value will be at most |EVP_HPKE_MAX_ENC_LENGTH|. @@ -233,7 +235,7 @@ OPENSSL_EXPORT int EVP_HPKE_CTX_setup_sender( // EVP_HPKE_CTX_setup_sender_with_seed_for_testing behaves like // |EVP_HPKE_CTX_setup_sender|, but takes a seed to behave deterministically. // The seed's format depends on |kem|. For X25519, it is the sender's -// ephemeral private key. +// ephemeral private key. For P256, it's an HKDF input. 
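As a rough illustration of the newly added P-256 KEM, the following sketch sets up an HPKE sender context and seals one message. The recipient's encoded public key, the buffer sizes, and the choice of HKDF-SHA256 with AES-128-GCM are assumptions for the example, not requirements of the API.

#include <CCryptoBoringSSL_hpke.h>

/* Sketch: HPKE sender setup with the P-256 KEM, followed by a single seal.
 * |peer_pub|/|peer_pub_len| is the recipient's encoded public key (65 bytes
 * for P-256); |out_enc| must hold at least EVP_HPKE_MAX_ENC_LENGTH bytes.
 * Returns 1 on success, 0 on error. */
static int hpke_p256_seal_one(const uint8_t *peer_pub, size_t peer_pub_len,
                              const uint8_t *msg, size_t msg_len,
                              uint8_t *out_enc, size_t *out_enc_len,
                              uint8_t *out_ct, size_t *out_ct_len,
                              size_t max_ct) {
  EVP_HPKE_CTX ctx;
  EVP_HPKE_CTX_zero(&ctx);
  int ok =
      EVP_HPKE_CTX_setup_sender(&ctx, out_enc, out_enc_len,
                                EVP_HPKE_MAX_ENC_LENGTH /* now 65 */,
                                EVP_hpke_p256_hkdf_sha256(),
                                EVP_hpke_hkdf_sha256(), EVP_hpke_aes_128_gcm(),
                                peer_pub, peer_pub_len,
                                /*info=*/NULL, /*info_len=*/0) &&
      EVP_HPKE_CTX_seal(&ctx, out_ct, out_ct_len, max_ct, msg, msg_len,
                        /*ad=*/NULL, /*ad_len=*/0);
  EVP_HPKE_CTX_cleanup(&ctx);
  return ok;
}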
OPENSSL_EXPORT int EVP_HPKE_CTX_setup_sender_with_seed_for_testing( EVP_HPKE_CTX *ctx, uint8_t *out_enc, size_t *out_enc_len, size_t max_enc, const EVP_HPKE_KEM *kem, const EVP_HPKE_KDF *kdf, const EVP_HPKE_AEAD *aead, @@ -265,7 +267,7 @@ OPENSSL_EXPORT int EVP_HPKE_CTX_setup_auth_sender( // EVP_HPKE_CTX_setup_auth_sender_with_seed_for_testing behaves like // |EVP_HPKE_CTX_setup_auth_sender|, but takes a seed to behave // deterministically. The seed's format depends on |kem|. For X25519, it is the -// sender's ephemeral private key. +// sender's ephemeral private key. For P256, it's an HKDF input. OPENSSL_EXPORT int EVP_HPKE_CTX_setup_auth_sender_with_seed_for_testing( EVP_HPKE_CTX *ctx, uint8_t *out_enc, size_t *out_enc_len, size_t max_enc, const EVP_HPKE_KEY *key, const EVP_HPKE_KDF *kdf, const EVP_HPKE_AEAD *aead, @@ -375,8 +377,8 @@ struct evp_hpke_ctx_st { struct evp_hpke_key_st { const EVP_HPKE_KEM *kem; - uint8_t private_key[X25519_PRIVATE_KEY_LEN]; - uint8_t public_key[X25519_PUBLIC_VALUE_LEN]; + uint8_t private_key[EVP_HPKE_MAX_PRIVATE_KEY_LENGTH]; + uint8_t public_key[EVP_HPKE_MAX_PUBLIC_KEY_LENGTH]; }; diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mldsa.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mldsa.h new file mode 100644 index 00000000..80a70307 --- /dev/null +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mldsa.h @@ -0,0 +1,136 @@ +/* Copyright (c) 2024, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_MLDSA_H_ +#define OPENSSL_HEADER_MLDSA_H_ + +#include "CCryptoBoringSSL_base.h" + +#if defined(__cplusplus) +extern "C" { +#endif + + +// ML-DSA-65. +// +// This implements the Module-Lattice-Based Digital Signature Standard from +// https://csrc.nist.gov/pubs/fips/204/final + + +// MLDSA65_private_key contains an ML-DSA-65 private key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLDSA65_private_key { + union { + uint8_t bytes[32 + 32 + 64 + 256 * 4 * (5 + 6 + 6)]; + uint32_t alignment; + } opaque; +}; + +// MLDSA65_public_key contains an ML-DSA-65 public key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLDSA65_public_key { + union { + uint8_t bytes[32 + 64 + 256 * 4 * 6]; + uint32_t alignment; + } opaque; +}; + +// MLDSA65_PRIVATE_KEY_BYTES is the number of bytes in an encoded ML-DSA-65 +// private key. +#define MLDSA65_PRIVATE_KEY_BYTES 4032 + +// MLDSA65_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-DSA-65 +// public key. +#define MLDSA65_PUBLIC_KEY_BYTES 1952 + +// MLDSA65_SIGNATURE_BYTES is the number of bytes in an encoded ML-DSA-65 +// signature. +#define MLDSA65_SIGNATURE_BYTES 3309 + +// MLDSA_SEED_BYTES is the number of bytes in an ML-DSA seed value. 
+#define MLDSA_SEED_BYTES 32 + +// MLDSA65_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key|, writes the seed to +// |out_seed|, and sets |out_private_key| to the private key. Returns 1 on +// success and 0 on allocation failure. +OPENSSL_EXPORT int MLDSA65_generate_key( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], + struct MLDSA65_private_key *out_private_key); + +// MLDSA65_private_key_from_seed regenerates a private key from a seed value +// that was generated by |MLDSA65_generate_key|. Returns 1 on success and 0 on +// allocation failure or if |seed_len| is incorrect. +OPENSSL_EXPORT int MLDSA65_private_key_from_seed( + struct MLDSA65_private_key *out_private_key, const uint8_t *seed, + size_t seed_len); + +// MLDSA65_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. Returns 1 on success and 0 on failure. +OPENSSL_EXPORT int MLDSA65_public_from_private( + struct MLDSA65_public_key *out_public_key, + const struct MLDSA65_private_key *private_key); + +// MLDSA65_sign generates a signature for the message |msg| of length +// |msg_len| using |private_key| (following the randomized algorithm), and +// writes the encoded signature to |out_encoded_signature|. The |context| +// argument is also signed over and can be used to include implicit contextual +// information that isn't included in |msg|. The same value of |context| must be +// presented to |MLDSA65_verify| in order for the generated signature to be +// considered valid. |context| and |context_len| may be |NULL| and 0 to use an +// empty context (this is common). Returns 1 on success and 0 on failure. +OPENSSL_EXPORT int MLDSA65_sign( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context, size_t context_len); + +// MLDSA65_verify verifies that |signature| constitutes a valid +// signature for the message |msg| of length |msg_len| using |public_key|. The +// value of |context| must equal the value that was passed to |MLDSA65_sign| +// when the signature was generated. Returns 1 on success or 0 on error. +OPENSSL_EXPORT int MLDSA65_verify(const struct MLDSA65_public_key *public_key, + const uint8_t *signature, + size_t signature_len, const uint8_t *msg, + size_t msg_len, const uint8_t *context, + size_t context_len); + + +// Serialisation of keys. + +// MLDSA65_marshal_public_key serializes |public_key| to |out| in the standard +// format for ML-DSA-65 public keys. It returns 1 on success or 0 on +// allocation error. +OPENSSL_EXPORT int MLDSA65_marshal_public_key( + CBB *out, const struct MLDSA65_public_key *public_key); + +// MLDSA65_parse_public_key parses a public key, in the format generated by +// |MLDSA65_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns 1 on success or 0 on parse error or if +// there are trailing bytes in |in|. +OPENSSL_EXPORT int MLDSA65_parse_public_key( + struct MLDSA65_public_key *public_key, CBS *in); + +// MLDSA65_parse_private_key parses a private key, in the NIST format, from |in| +// and writes the result to |out_private_key|. It returns 1 on success or 0 on +// parse error or if there are trailing bytes in |in|. 
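A minimal round-trip sketch of the ML-DSA-65 functions declared above, using an empty context string. The buffer sizes come from the constants in this header, and error handling is reduced to early returns; the verifier side parses the encoded public key as it would on the wire.

#include <CCryptoBoringSSL_bytestring.h>
#include <CCryptoBoringSSL_mldsa.h>

/* Sketch: generate an ML-DSA-65 key pair, sign a message, and verify the
 * signature against the encoded public key. Returns 1 on success. */
static int mldsa65_round_trip(void) {
  uint8_t encoded_pub[MLDSA65_PUBLIC_KEY_BYTES];
  uint8_t seed[MLDSA_SEED_BYTES];
  struct MLDSA65_private_key priv;
  if (!MLDSA65_generate_key(encoded_pub, seed, &priv)) {
    return 0;
  }

  static const uint8_t kMsg[] = {'h', 'e', 'l', 'l', 'o'};
  uint8_t sig[MLDSA65_SIGNATURE_BYTES];
  if (!MLDSA65_sign(sig, &priv, kMsg, sizeof(kMsg), /*context=*/NULL,
                    /*context_len=*/0)) {
    return 0;
  }

  /* The verifier would normally parse the encoded public key it received. */
  struct MLDSA65_public_key pub;
  CBS cbs;
  CBS_init(&cbs, encoded_pub, sizeof(encoded_pub));
  if (!MLDSA65_parse_public_key(&pub, &cbs)) {
    return 0;
  }
  return MLDSA65_verify(&pub, sig, sizeof(sig), kMsg, sizeof(kMsg),
                        /*context=*/NULL, /*context_len=*/0);
}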
+OPENSSL_EXPORT int MLDSA65_parse_private_key( + struct MLDSA65_private_key *private_key, CBS *in); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_MLDSA_H_ diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mlkem.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mlkem.h new file mode 100644 index 00000000..4472aa39 --- /dev/null +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_mlkem.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2024, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_MLKEM_H +#define OPENSSL_HEADER_MLKEM_H + +#include "CCryptoBoringSSL_base.h" + +#if defined(__cplusplus) +extern "C" { +#endif + + +// ML-KEM-768. +// +// This implements the Module-Lattice-Based Key-Encapsulation Mechanism from +// https://csrc.nist.gov/pubs/fips/204/final + + +// MLKEM768_public_key contains an ML-KEM-768 public key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLKEM768_public_key { + union { + uint8_t bytes[512 * (3 + 9) + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// MLKEM768_private_key contains an ML-KEM-768 private key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLKEM768_private_key { + union { + uint8_t bytes[512 * (3 + 3 + 9) + 32 + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// MLKEM768_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM-768 +// public key. +#define MLKEM768_PUBLIC_KEY_BYTES 1184 + +// MLKEM_SEED_BYTES is the number of bytes in an ML-KEM seed. +#define MLKEM_SEED_BYTES 64 + +// MLKEM768_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key| and sets |out_private_key| to +// the private key. If |optional_out_seed| is not NULL then the seed used to +// generate the private key is written to it. +OPENSSL_EXPORT void MLKEM768_generate_key( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + struct MLKEM768_private_key *out_private_key); + +// MLKEM768_private_key_from_seed derives a private key from a seed that was +// generated by |MLKEM768_generate_key|. It fails and returns 0 if |seed_len| is +// incorrect, otherwise it writes |*out_private_key| and returns 1. +OPENSSL_EXPORT int MLKEM768_private_key_from_seed( + struct MLKEM768_private_key *out_private_key, const uint8_t *seed, + size_t seed_len); + +// MLKEM768_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. (This is faster than parsing the output of +// |MLKEM768_generate_key| if, for some reason, you need to encapsulate to a key +// that was just generated.) 
+OPENSSL_EXPORT void MLKEM768_public_from_private( + struct MLKEM768_public_key *out_public_key, + const struct MLKEM768_private_key *private_key); + +// MLKEM768_CIPHERTEXT_BYTES is number of bytes in the ML-KEM-768 ciphertext. +#define MLKEM768_CIPHERTEXT_BYTES 1088 + +// MLKEM_SHARED_SECRET_BYTES is the number of bytes in an ML-KEM shared secret. +#define MLKEM_SHARED_SECRET_BYTES 32 + +// MLKEM768_encap encrypts a random shared secret for |public_key|, writes the +// ciphertext to |out_ciphertext|, and writes the random shared secret to +// |out_shared_secret|. +OPENSSL_EXPORT void MLKEM768_encap( + uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM768_public_key *public_key); + +// MLKEM768_decap decrypts a shared secret from |ciphertext| using |private_key| +// and writes it to |out_shared_secret|. If |ciphertext_len| is incorrect it +// returns 0, otherwise it returns 1. If |ciphertext| is invalid (but of the +// correct length), |out_shared_secret| is filled with a key that will always be +// the same for the same |ciphertext| and |private_key|, but which appears to be +// random unless one has access to |private_key|. These alternatives occur in +// constant time. Any subsequent symmetric encryption using |out_shared_secret| +// must use an authenticated encryption scheme in order to discover the +// decapsulation failure. +OPENSSL_EXPORT int MLKEM768_decap( + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const struct MLKEM768_private_key *private_key); + + +// Serialisation of keys. + +// MLKEM768_marshal_public_key serializes |public_key| to |out| in the standard +// format for ML-KEM-768 public keys. It returns one on success or zero on +// allocation error. +OPENSSL_EXPORT int MLKEM768_marshal_public_key( + CBB *out, const struct MLKEM768_public_key *public_key); + +// MLKEM768_parse_public_key parses a public key, in the format generated by +// |MLKEM768_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. +OPENSSL_EXPORT int MLKEM768_parse_public_key( + struct MLKEM768_public_key *out_public_key, CBS *in); + +// MLKEM768_PRIVATE_KEY_BYTES is the length of the data produced by +// |MLKEM768_marshal_private_key|. +#define MLKEM768_PRIVATE_KEY_BYTES 2400 + +// MLKEM768_parse_private_key parses a private key, in NIST's format for +// private keys, from |in| and writes the result to |out_private_key|. It +// returns one on success or zero on parse error or if there are trailing bytes +// in |in|. This format is verbose and should be avoided. Private keys should be +// stored as seeds and parsed using |MLKEM768_private_key_from_seed|. +OPENSSL_EXPORT int MLKEM768_parse_private_key( + struct MLKEM768_private_key *out_private_key, CBS *in); + + +// ML-KEM-1024 +// +// ML-KEM-1024 also exists. You should prefer ML-KEM-768 where possible. + +// MLKEM1024_public_key contains an ML-KEM-1024 public key. The contents of this +// object should never leave the address space since the format is unstable. +struct MLKEM1024_public_key { + union { + uint8_t bytes[512 * (4 + 16) + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// MLKEM1024_private_key contains a ML-KEM-1024 private key. The contents of +// this object should never leave the address space since the format is +// unstable. 
+struct MLKEM1024_private_key { + union { + uint8_t bytes[512 * (4 + 4 + 16) + 32 + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// MLKEM1024_PUBLIC_KEY_BYTES is the number of bytes in an encoded ML-KEM-1024 +// public key. +#define MLKEM1024_PUBLIC_KEY_BYTES 1568 + +// MLKEM1024_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key| and sets |out_private_key| to +// the private key. If |optional_out_seed| is not NULL then the seed used to +// generate the private key is written to it. +OPENSSL_EXPORT void MLKEM1024_generate_key( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + struct MLKEM1024_private_key *out_private_key); + +// MLKEM1024_private_key_from_seed derives a private key from a seed that was +// generated by |MLKEM1024_generate_key|. It fails and returns 0 if |seed_len| +// is incorrect, otherwise it writes |*out_private_key| and returns 1. +OPENSSL_EXPORT int MLKEM1024_private_key_from_seed( + struct MLKEM1024_private_key *out_private_key, const uint8_t *seed, + size_t seed_len); + +// MLKEM1024_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. (This is faster than parsing the output of +// |MLKEM1024_generate_key| if, for some reason, you need to encapsulate to a +// key that was just generated.) +OPENSSL_EXPORT void MLKEM1024_public_from_private( + struct MLKEM1024_public_key *out_public_key, + const struct MLKEM1024_private_key *private_key); + +// MLKEM1024_CIPHERTEXT_BYTES is number of bytes in the ML-KEM-1024 ciphertext. +#define MLKEM1024_CIPHERTEXT_BYTES 1568 + +// MLKEM1024_encap encrypts a random shared secret for |public_key|, writes the +// ciphertext to |out_ciphertext|, and writes the random shared secret to +// |out_shared_secret|. +OPENSSL_EXPORT void MLKEM1024_encap( + uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM1024_public_key *public_key); + +// MLKEM1024_decap decrypts a shared secret from |ciphertext| using +// |private_key| and writes it to |out_shared_secret|. If |ciphertext_len| is +// incorrect it returns 0, otherwise it returns 1. If |ciphertext| is invalid +// (but of the correct length), |out_shared_secret| is filled with a key that +// will always be the same for the same |ciphertext| and |private_key|, but +// which appears to be random unless one has access to |private_key|. These +// alternatives occur in constant time. Any subsequent symmetric encryption +// using |out_shared_secret| must use an authenticated encryption scheme in +// order to discover the decapsulation failure. +OPENSSL_EXPORT int MLKEM1024_decap( + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const struct MLKEM1024_private_key *private_key); + + +// Serialisation of ML-KEM-1024 keys. + +// MLKEM1024_marshal_public_key serializes |public_key| to |out| in the standard +// format for ML-KEM-1024 public keys. It returns one on success or zero on +// allocation error. +OPENSSL_EXPORT int MLKEM1024_marshal_public_key( + CBB *out, const struct MLKEM1024_public_key *public_key); + +// MLKEM1024_parse_public_key parses a public key, in the format generated by +// |MLKEM1024_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. 
+OPENSSL_EXPORT int MLKEM1024_parse_public_key( + struct MLKEM1024_public_key *out_public_key, CBS *in); + +// MLKEM1024_PRIVATE_KEY_BYTES is the length of the data produced by +// |MLKEM1024_marshal_private_key|. +#define MLKEM1024_PRIVATE_KEY_BYTES 3168 + +// MLKEM1024_parse_private_key parses a private key, in NIST's format for +// private keys, from |in| and writes the result to |out_private_key|. It +// returns one on success or zero on parse error or if there are trailing bytes +// in |in|. This format is verbose and should be avoided. Private keys should be +// stored as seeds and parsed using |MLKEM1024_private_key_from_seed|. +OPENSSL_EXPORT int MLKEM1024_parse_private_key( + struct MLKEM1024_private_key *out_private_key, CBS *in); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_MLKEM_H diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_nid.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_nid.h index a1c03a07..521adfe6 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_nid.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_nid.h @@ -4255,6 +4255,9 @@ extern "C" { #define SN_X25519Kyber768Draft00 "X25519Kyber768Draft00" #define NID_X25519Kyber768Draft00 964 +#define SN_X25519MLKEM768 "X25519MLKEM768" +#define NID_X25519MLKEM768 965 + #if defined(__cplusplus) } /* extern C */ diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_pem.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_pem.h index b93b284c..443068a6 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_pem.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_pem.h @@ -385,10 +385,9 @@ OPENSSL_EXPORT int PEM_ASN1_write(i2d_of_void *i2d, const char *name, FILE *fp, pem_password_cb *callback, void *u); // PEM_def_callback treats |userdata| as a string and copies it into |buf|, -// assuming its |size| is sufficient. Returns the length of the string, or 0 -// if there is not enough room. If either |buf| or |userdata| is NULL, 0 is -// returned. Note that this is different from OpenSSL, which prompts for a -// password. +// assuming its |size| is sufficient. Returns the length of the string, or -1 on +// error. Error cases the buffer being too small, or |buf| and |userdata| being +// NULL. Note that this is different from OpenSSL, which prompts for a password. OPENSSL_EXPORT int PEM_def_callback(char *buf, int size, int rwflag, void *userdata); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_rand.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_rand.h index fcac8f2c..5ca7fe94 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_rand.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_rand.h @@ -33,20 +33,29 @@ OPENSSL_EXPORT int RAND_bytes(uint8_t *buf, size_t len); // Obscure functions. #if !defined(OPENSSL_WINDOWS) -// RAND_enable_fork_unsafe_buffering enables efficient buffered reading of -// /dev/urandom. It adds an overhead of a few KB of memory per thread. It must -// be called before the first call to |RAND_bytes|. +// RAND_enable_fork_unsafe_buffering indicates that clones of the address space, +// e.g. via |fork|, will never call into BoringSSL. It may be used to disable +// BoringSSL's more expensive fork-safety measures. However, calling this +// function and then using BoringSSL across |fork| calls will leak secret keys. +// |fd| must be -1. // -// |fd| must be -1. We no longer support setting the file descriptor with this -// function. 
+// WARNING: This function affects BoringSSL for the entire address space. Thus +// this function should never be called by library code, only by code with +// global knowledge of the application's use of BoringSSL. // -// It has an unusual name because the buffer is unsafe across calls to |fork|. -// Hence, this function should never be called by libraries. +// Do not use this function unless a performance issue was measured with the +// default behavior. BoringSSL can efficiently detect forks on most platforms, +// in which case this function is a no-op and is unnecessary. In particular, +// Linux kernel versions 4.14 or later provide |MADV_WIPEONFORK|. Future +// versions of BoringSSL will remove this functionality when older kernels are +// sufficiently rare. +// +// This function has an unusual name because it historically controlled internal +// buffers, but no longer does. OPENSSL_EXPORT void RAND_enable_fork_unsafe_buffering(int fd); -// RAND_disable_fork_unsafe_buffering disables efficient buffered reading of -// /dev/urandom, causing BoringSSL to always draw entropy on every request -// for random bytes. +// RAND_disable_fork_unsafe_buffering restores BoringSSL's default fork-safety +// protections. See also |RAND_enable_fork_unsafe_buffering|. OPENSSL_EXPORT void RAND_disable_fork_unsafe_buffering(void); #endif diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_service_indicator.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_service_indicator.h index f3a0c1cc..87625dc6 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_service_indicator.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_service_indicator.h @@ -56,7 +56,7 @@ extern "C++" { return func; \ }() -namespace bssl { +BSSL_NAMESPACE_BEGIN enum class FIPSStatus { NOT_APPROVED = 0, @@ -87,7 +87,7 @@ class FIPSIndicatorHelper { const uint64_t before_; }; -} // namespace bssl +BSSL_NAMESPACE_END } // extern "C++" #endif // !BORINGSSL_NO_CXX diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_sha.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_sha.h index 471bda13..dff93f36 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_sha.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_sha.h @@ -58,6 +58,7 @@ #define OPENSSL_HEADER_SHA_H #include "CCryptoBoringSSL_base.h" +#include "CCryptoBoringSSL_bcm_public.h" // IWYU pragma: export #if defined(__cplusplus) extern "C" { @@ -112,29 +113,6 @@ OPENSSL_EXPORT void SHA1_Transform(SHA_CTX *sha, OPENSSL_EXPORT void CRYPTO_fips_186_2_prf( uint8_t *out, size_t out_len, const uint8_t xkey[SHA_DIGEST_LENGTH]); -struct sha_state_st { -#if defined(__cplusplus) || defined(OPENSSL_WINDOWS) - uint32_t h[5]; -#else - // wpa_supplicant accesses |h0|..|h4| so we must support those names for - // compatibility with it until it can be updated. Anonymous unions are only - // standard in C11, so disable this workaround in C++. - union { - uint32_t h[5]; - struct { - uint32_t h0; - uint32_t h1; - uint32_t h2; - uint32_t h3; - uint32_t h4; - }; - }; -#endif - uint32_t Nl, Nh; - uint8_t data[SHA_CBLOCK]; - unsigned num; -}; - // SHA-224. 
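The ML-DSA-65 declarations added in CCryptoBoringSSL_mldsa.h earlier in this patch compose as follows. This is a minimal, hypothetical sketch rather than part of the generated sources: the include path and the helper name mldsa65_roundtrip are illustrative, and error handling is abbreviated.

#include "CCryptoBoringSSL_mldsa.h"

// Generate an ML-DSA-65 key pair, sign a message with an empty context, and
// verify the signature. Returns 1 on success and 0 on failure.
static int mldsa65_roundtrip(void) {
  static struct MLDSA65_private_key priv;
  static struct MLDSA65_public_key pub;
  uint8_t encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES];
  uint8_t seed[MLDSA_SEED_BYTES];
  if (!MLDSA65_generate_key(encoded_public_key, seed, &priv)) {
    return 0;
  }
  // |seed| alone is enough to recreate the private key later via
  // |MLDSA65_private_key_from_seed|.
  if (!MLDSA65_public_from_private(&pub, &priv)) {
    return 0;
  }

  static const uint8_t kMsg[] = {'h', 'i'};
  uint8_t sig[MLDSA65_SIGNATURE_BYTES];
  // NULL/0 selects the empty context; the verifier must pass the same context.
  if (!MLDSA65_sign(sig, &priv, kMsg, sizeof(kMsg), /*context=*/NULL, 0)) {
    return 0;
  }
  return MLDSA65_verify(&pub, sig, sizeof(sig), kMsg, sizeof(kMsg),
                        /*context=*/NULL, 0);
}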
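The ML-KEM-768 flow in CCryptoBoringSSL_mlkem.h follows the same pattern: the recipient generates a key pair, a sender encapsulates to the public key, and the recipient decapsulates the same 32-byte secret. Again a hypothetical sketch under the same assumptions; the final memcmp only illustrates that both sides derive the same value, since, as the MLKEM768_decap comment notes, a mismatch caused by an invalid ciphertext can only be detected through a subsequent authenticated encryption scheme.

#include <string.h>

#include "CCryptoBoringSSL_mlkem.h"

// Encapsulate and decapsulate a shared secret with ML-KEM-768. Returns 1 if
// both sides computed the same secret.
static int mlkem768_roundtrip(void) {
  static struct MLKEM768_private_key priv;
  static struct MLKEM768_public_key pub;
  uint8_t encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES];
  uint8_t seed[MLKEM_SEED_BYTES];
  MLKEM768_generate_key(encoded_public_key, seed, &priv);

  // A real sender would parse |encoded_public_key| with
  // |MLKEM768_parse_public_key|; here the in-memory key is reused directly.
  MLKEM768_public_from_private(&pub, &priv);

  uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES];
  uint8_t sender_secret[MLKEM_SHARED_SECRET_BYTES];
  MLKEM768_encap(ciphertext, sender_secret, &pub);

  uint8_t recipient_secret[MLKEM_SHARED_SECRET_BYTES];
  if (!MLKEM768_decap(recipient_secret, ciphertext, sizeof(ciphertext),
                      &priv)) {
    return 0;
  }
  return memcmp(sender_secret, recipient_secret, MLKEM_SHARED_SECRET_BYTES) ==
         0;
}

For long-term storage, the header recommends keeping only the 64-byte seed and rebuilding the key with MLKEM768_private_key_from_seed rather than serialising the verbose NIST private-key format.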
diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_span.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_span.h index aa9396a7..90ee7bb1 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_span.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_span.h @@ -30,6 +30,28 @@ extern "C++" { #include #endif +#if defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 201911L +#include +BSSL_NAMESPACE_BEGIN +template +class Span; +BSSL_NAMESPACE_END + +// Mark `Span` as satisfying the `view` and `borrowed_range` concepts. This +// should be done before the definition of `Span`, so that any inlined calls to +// range functionality use the correct specializations. +template +inline constexpr bool std::ranges::enable_view> = true; +template +inline constexpr bool std::ranges::enable_borrowed_range> = true; +#endif + BSSL_NAMESPACE_BEGIN template @@ -49,6 +71,16 @@ class SpanBase { friend bool operator!=(Span lhs, Span rhs) { return !(lhs == rhs); } }; + +// Heuristically test whether C is a container type that can be converted into +// a Span by checking for data() and size() member functions. +// +// TODO(davidben): Require C++17 support for std::is_convertible_v, etc. +template +using EnableIfContainer = std::enable_if_t< + std::is_convertible().data()), T *>::value && + std::is_integral().size())>::value>; + } // namespace internal // A Span is a non-owning reference to a contiguous array of objects of type @@ -84,16 +116,6 @@ class SpanBase { // a reference or pointer to a container or array. template class Span : private internal::SpanBase { - private: - // Heuristically test whether C is a container type that can be converted into - // a Span by checking for data() and size() member functions. - // - // TODO(davidben): Require C++17 support for std::is_convertible_v, etc. - template - using EnableIfContainer = std::enable_if_t< - std::is_convertible().data()), T *>::value && - std::is_integral().size())>::value>; - public: static const size_t npos = static_cast(-1); @@ -114,12 +136,12 @@ class Span : private internal::SpanBase { template constexpr Span(T (&array)[N]) : Span(array, N) {} - template , + template , typename = std::enable_if_t::value, C>> constexpr Span(const C &container) : data_(container.data()), size_(container.size()) {} - template , + template , typename = std::enable_if_t::value, C>> constexpr explicit Span(C &container) : data_(container.data()), size_(container.size()) {} @@ -188,6 +210,20 @@ class Span : private internal::SpanBase { template const size_t Span::npos; +#if __cplusplus >= 201703L +template +Span(T *, size_t) -> Span; +template +Span(T (&array)[size]) -> Span; +template < + typename C, + typename T = std::remove_pointer_t().data())>, + typename = internal::EnableIfContainer> +Span(C &) -> Span; +#endif + +// C++17 callers can instead rely on CTAD and the deduction guides defined +// above. template constexpr Span MakeSpan(T *ptr, size_t size) { return Span(ptr, size); diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_stack.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_stack.h index 879be51b..be689841 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_stack.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_stack.h @@ -139,7 +139,8 @@ STACK_OF(SAMPLE) *sk_SAMPLE_new(sk_SAMPLE_cmp_func comp); STACK_OF(SAMPLE) *sk_SAMPLE_new_null(void); // sk_SAMPLE_num returns the number of elements in |sk|. 
It is safe to cast this -// value to |int|. |sk| is guaranteed to have at most |INT_MAX| elements. +// value to |int|. |sk| is guaranteed to have at most |INT_MAX| elements. If +// |sk| is NULL, it is treated as the empty list and this function returns zero. size_t sk_SAMPLE_num(const STACK_OF(SAMPLE) *sk); // sk_SAMPLE_zero resets |sk| to the empty state but does nothing to free the @@ -147,7 +148,8 @@ size_t sk_SAMPLE_num(const STACK_OF(SAMPLE) *sk); void sk_SAMPLE_zero(STACK_OF(SAMPLE) *sk); // sk_SAMPLE_value returns the |i|th pointer in |sk|, or NULL if |i| is out of -// range. +// range. If |sk| is NULL, it is treated as an empty list and the function +// returns NULL. SAMPLE *sk_SAMPLE_value(const STACK_OF(SAMPLE) *sk, size_t i); // sk_SAMPLE_set sets the |i|th pointer in |sk| to |p| and returns |p|. If |i| @@ -195,7 +197,8 @@ void sk_SAMPLE_delete_if(STACK_OF(SAMPLE) *sk, sk_SAMPLE_delete_if_func func, // If the stack is sorted (see |sk_SAMPLE_sort|), this function uses a binary // search. Otherwise it performs a linear search. If it finds a matching // element, it writes the index to |*out_index| (if |out_index| is not NULL) and -// returns one. Otherwise, it returns zero. +// returns one. Otherwise, it returns zero. If |sk| is NULL, it is treated as +// the empty list and the function returns zero. // // Note this differs from OpenSSL. The type signature is slightly different, and // OpenSSL's version will implicitly sort |sk| if it has a comparison function @@ -399,6 +402,9 @@ BSSL_NAMESPACE_END * positive warning. */ \ OPENSSL_MSVC_PRAGMA(warning(push)) \ OPENSSL_MSVC_PRAGMA(warning(disable : 4191)) \ + OPENSSL_CLANG_PRAGMA("clang diagnostic push") \ + OPENSSL_CLANG_PRAGMA("clang diagnostic ignored \"-Wunknown-warning-option\"") \ + OPENSSL_CLANG_PRAGMA("clang diagnostic ignored \"-Wcast-function-type-strict\"") \ \ DECLARE_STACK_OF(name) \ \ @@ -534,6 +540,7 @@ BSSL_NAMESPACE_END (OPENSSL_sk_free_func)free_func); \ } \ \ + OPENSSL_CLANG_PRAGMA("clang diagnostic pop") \ OPENSSL_MSVC_PRAGMA(warning(pop)) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_target.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_target.h index 29b1dc61..2760f52c 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_target.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_target.h @@ -84,18 +84,18 @@ // Trusty and Android baremetal aren't Linux but currently define __linux__. // As a workaround, we exclude them here. -// We also exclude nanolibc/CrOS EC/Zephyr. nanolibc/CrOS EC/Zephyr -// sometimes build for a non-Linux target (which should not define __linux__), -// but also sometimes build for Linux. Although technically running in Linux -// userspace, this lacks all the libc APIs we'd normally expect on Linux, so we -// treat it as a non-Linux target. +// We also exclude nanolibc/CrOS EC. nanolibc/CrOS EC sometimes build for a +// non-Linux target (which should not define __linux__), but also sometimes +// build for Linux. Although technically running in Linux userspace, this lacks +// all the libc APIs we'd normally expect on Linux, so we treat it as a +// non-Linux target. // // TODO(b/169780122): Remove this workaround once Trusty no longer defines it. // TODO(b/291101350): Remove this workaround once Android baremetal no longer // defines it. 
#if defined(__linux__) && !defined(__TRUSTY__) && \ !defined(ANDROID_BAREMETAL) && !defined(OPENSSL_NANOLIBC) && \ - !defined(CROS_EC) && !defined(CROS_ZEPHYR) + !defined(CROS_EC) #define OPENSSL_LINUX #endif @@ -148,16 +148,19 @@ #define OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED #endif -// CROS_ZEPHYR is an embedded target for ChromeOS Zephyr Embedded Controller. +// Zephyr is an open source RTOS, optimized for embedded devices. // Defining this on any other platform is not supported. Other embedded // platforms must introduce their own defines. // -// https://chromium.googlesource.com/chromiumos/platform/ec/+/HEAD/docs/zephyr/README.md -#if defined(CROS_ZEPHYR) +// Zephyr supports multithreading with cooperative and preemptive scheduling. +// It also implements POSIX Threads (pthread) API, so it's not necessary to +// implement BoringSSL internal threading API using some custom API. +// +// https://www.zephyrproject.org/ +#if defined(__ZEPHYR__) #define OPENSSL_NO_FILESYSTEM #define OPENSSL_NO_POSIX_IO #define OPENSSL_NO_SOCK -#define OPENSSL_NO_THREADS_CORRUPT_MEMORY_AND_LEAK_SECRETS_IF_THREADED #endif #if defined(__ANDROID_API__) diff --git a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_x509.h b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_x509.h index 202a42af..fe832502 100644 --- a/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_x509.h +++ b/Sources/CCryptoBoringSSL/include/CCryptoBoringSSL_x509.h @@ -254,9 +254,9 @@ OPENSSL_EXPORT void X509_get0_uids(const X509 *x509, // should not be accepted. #define EXFLAG_CRITICAL 0x200 // EXFLAG_SS indicates the certificate is likely self-signed. That is, if it is -// self-issued, its authority key identifer (if any) matches itself, and its key -// usage extension (if any) allows certificate signatures. The signature itself -// is not checked in computing this bit. +// self-issued, its authority key identifier (if any) matches itself, and its +// key usage extension (if any) allows certificate signatures. The signature +// itself is not checked in computing this bit. #define EXFLAG_SS 0x2000 // X509_get_extension_flags decodes a set of extensions from |x509| and returns @@ -2696,8 +2696,18 @@ OPENSSL_EXPORT void X509_ALGOR_get0(const ASN1_OBJECT **out_obj, // X509_ALGOR_set_md sets |alg| to the hash function |md|. Note this // AlgorithmIdentifier represents the hash function itself, not a signature -// algorithm that uses |md|. -OPENSSL_EXPORT void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md); +// algorithm that uses |md|. It returns one on success and zero on error. +// +// Due to historical specification mistakes (see Section 2.1 of RFC 4055), the +// parameters field is sometimes omitted and sometimes a NULL value. When used +// in RSASSA-PSS and RSAES-OAEP, it should be a NULL value. In other contexts, +// the parameters should be omitted. This function assumes the caller is +// constructing a RSASSA-PSS or RSAES-OAEP AlgorithmIdentifier and includes a +// NULL parameter. This differs from OpenSSL's behavior. +// +// TODO(davidben): Rename this function, or perhaps just add a bespoke API for +// constructing PSS and move on. +OPENSSL_EXPORT int X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md); // X509_ALGOR_cmp returns zero if |a| and |b| are equal, and some non-zero value // otherwise. 
Note this function can only be used for equality checks, not an @@ -3022,6 +3032,9 @@ OPENSSL_EXPORT int X509_STORE_CTX_init(X509_STORE_CTX *ctx, X509_STORE *store, // |X509_STORE_CTX_get1_chain| may be used to return the verified certificate // chain. On error, |X509_STORE_CTX_get_error| may be used to return additional // error information. +// +// WARNING: Most failure conditions from this function do not use the error +// queue. Use |X509_STORE_CTX_get_error| to determine the cause of the error. OPENSSL_EXPORT int X509_verify_cert(X509_STORE_CTX *ctx); // X509_STORE_CTX_get0_chain, after a successful |X509_verify_cert| call, @@ -5281,29 +5294,6 @@ OPENSSL_EXPORT void X509_STORE_set_verify_cb( #define X509_STORE_set_verify_cb_func(store, func) \ X509_STORE_set_verify_cb((store), (func)) -typedef int (*X509_STORE_CTX_get_crl_fn)(X509_STORE_CTX *ctx, X509_CRL **crl, - X509 *x); -typedef int (*X509_STORE_CTX_check_crl_fn)(X509_STORE_CTX *ctx, X509_CRL *crl); - -// X509_STORE_set_get_crl override's |store|'s logic for looking up CRLs. -// -// Do not use this function. It is temporarily retained to support one caller -// and will be removed after that caller is fixed. It is not possible for -// external callers to correctly implement this callback. The real -// implementation sets some inaccessible internal state on |X509_STORE_CTX|. -OPENSSL_EXPORT void X509_STORE_set_get_crl(X509_STORE *store, - X509_STORE_CTX_get_crl_fn get_crl); - -// X509_STORE_set_check_crl override's |store|'s logic for checking CRL -// validity. -// -// Do not use this function. It is temporarily retained to support one caller -// and will be removed after that caller is fixed. It is not possible for -// external callers to correctly implement this callback. The real -// implementation relies some inaccessible internal state on |X509_STORE_CTX|. -OPENSSL_EXPORT void X509_STORE_set_check_crl( - X509_STORE *store, X509_STORE_CTX_check_crl_fn check_crl); - // X509_STORE_CTX_set_chain configures |ctx| to use |sk| for untrusted // intermediate certificates to use in verification. This function is redundant // with the |chain| parameter of |X509_STORE_CTX_init|. 
Use the parameter diff --git a/Sources/CCryptoBoringSSL/include/boringssl_prefix_symbols_nasm.inc b/Sources/CCryptoBoringSSL/include/boringssl_prefix_symbols_nasm.inc index 2f360bd2..517fba6f 100644 --- a/Sources/CCryptoBoringSSL/include/boringssl_prefix_symbols_nasm.inc +++ b/Sources/CCryptoBoringSSL/include/boringssl_prefix_symbols_nasm.inc @@ -207,6 +207,14 @@ %xdefine _BASIC_CONSTRAINTS_free _ %+ BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_free %xdefine _BASIC_CONSTRAINTS_it _ %+ BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_it %xdefine _BASIC_CONSTRAINTS_new _ %+ BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_new +%xdefine _BCM_fips_186_2_prf _ %+ BORINGSSL_PREFIX %+ _BCM_fips_186_2_prf +%xdefine _BCM_rand_bytes _ %+ BORINGSSL_PREFIX %+ _BCM_rand_bytes +%xdefine _BCM_rand_bytes_hwrng _ %+ BORINGSSL_PREFIX %+ _BCM_rand_bytes_hwrng +%xdefine _BCM_rand_bytes_with_additional_data _ %+ BORINGSSL_PREFIX %+ _BCM_rand_bytes_with_additional_data +%xdefine _BCM_sha1_final _ %+ BORINGSSL_PREFIX %+ _BCM_sha1_final +%xdefine _BCM_sha1_init _ %+ BORINGSSL_PREFIX %+ _BCM_sha1_init +%xdefine _BCM_sha1_transform _ %+ BORINGSSL_PREFIX %+ _BCM_sha1_transform +%xdefine _BCM_sha1_update _ %+ BORINGSSL_PREFIX %+ _BCM_sha1_update %xdefine _BIO_append_filename _ %+ BORINGSSL_PREFIX %+ _BIO_append_filename %xdefine _BIO_callback_ctrl _ %+ BORINGSSL_PREFIX %+ _BIO_callback_ctrl %xdefine _BIO_clear_flags _ %+ BORINGSSL_PREFIX %+ _BIO_clear_flags @@ -713,6 +721,16 @@ %xdefine _DH_size _ %+ BORINGSSL_PREFIX %+ _DH_size %xdefine _DH_up_ref _ %+ BORINGSSL_PREFIX %+ _DH_up_ref %xdefine _DHparams_dup _ %+ BORINGSSL_PREFIX %+ _DHparams_dup +%xdefine _DILITHIUM_generate_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_generate_key +%xdefine _DILITHIUM_generate_key_external_entropy _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_generate_key_external_entropy +%xdefine _DILITHIUM_marshal_private_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_marshal_private_key +%xdefine _DILITHIUM_marshal_public_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_marshal_public_key +%xdefine _DILITHIUM_parse_private_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_parse_private_key +%xdefine _DILITHIUM_parse_public_key _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_parse_public_key +%xdefine _DILITHIUM_public_from_private _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_public_from_private +%xdefine _DILITHIUM_sign _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_sign +%xdefine _DILITHIUM_sign_deterministic _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_sign_deterministic +%xdefine _DILITHIUM_verify _ %+ BORINGSSL_PREFIX %+ _DILITHIUM_verify %xdefine _DIRECTORYSTRING_free _ %+ BORINGSSL_PREFIX %+ _DIRECTORYSTRING_free %xdefine _DIRECTORYSTRING_it _ %+ BORINGSSL_PREFIX %+ _DIRECTORYSTRING_it %xdefine _DIRECTORYSTRING_new _ %+ BORINGSSL_PREFIX %+ _DIRECTORYSTRING_new @@ -1094,6 +1112,7 @@ %xdefine _EVP_PKEY_CTX_set0_rsa_oaep_label _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set0_rsa_oaep_label %xdefine _EVP_PKEY_CTX_set1_hkdf_key _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set1_hkdf_key %xdefine _EVP_PKEY_CTX_set1_hkdf_salt _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set1_hkdf_salt +%xdefine _EVP_PKEY_CTX_set_dh_pad _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dh_pad %xdefine _EVP_PKEY_CTX_set_dsa_paramgen_bits _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dsa_paramgen_bits %xdefine _EVP_PKEY_CTX_set_dsa_paramgen_q_bits _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dsa_paramgen_q_bits %xdefine _EVP_PKEY_CTX_set_ec_param_enc _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_ec_param_enc @@ -1110,6 +1129,7 @@ %xdefine _EVP_PKEY_CTX_set_rsa_pss_saltlen _ %+ BORINGSSL_PREFIX %+ 
_EVP_PKEY_CTX_set_rsa_pss_saltlen %xdefine _EVP_PKEY_CTX_set_signature_md _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_signature_md %xdefine _EVP_PKEY_assign _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign +%xdefine _EVP_PKEY_assign_DH _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign_DH %xdefine _EVP_PKEY_assign_DSA _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign_DSA %xdefine _EVP_PKEY_assign_EC_KEY _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign_EC_KEY %xdefine _EVP_PKEY_assign_RSA _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_assign_RSA @@ -1151,6 +1171,7 @@ %xdefine _EVP_PKEY_print_params _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_print_params %xdefine _EVP_PKEY_print_private _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_print_private %xdefine _EVP_PKEY_print_public _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_print_public +%xdefine _EVP_PKEY_set1_DH _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_set1_DH %xdefine _EVP_PKEY_set1_DSA _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_set1_DSA %xdefine _EVP_PKEY_set1_EC_KEY _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_set1_EC_KEY %xdefine _EVP_PKEY_set1_RSA _ %+ BORINGSSL_PREFIX %+ _EVP_PKEY_set1_RSA @@ -1235,6 +1256,7 @@ %xdefine _EVP_hpke_aes_256_gcm _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_aes_256_gcm %xdefine _EVP_hpke_chacha20_poly1305 _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_chacha20_poly1305 %xdefine _EVP_hpke_hkdf_sha256 _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_hkdf_sha256 +%xdefine _EVP_hpke_p256_hkdf_sha256 _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_p256_hkdf_sha256 %xdefine _EVP_hpke_x25519_hkdf_sha256 _ %+ BORINGSSL_PREFIX %+ _EVP_hpke_x25519_hkdf_sha256 %xdefine _EVP_marshal_digest_algorithm _ %+ BORINGSSL_PREFIX %+ _EVP_marshal_digest_algorithm %xdefine _EVP_marshal_private_key _ %+ BORINGSSL_PREFIX %+ _EVP_marshal_private_key @@ -1337,6 +1359,18 @@ %xdefine _MD5_Update _ %+ BORINGSSL_PREFIX %+ _MD5_Update %xdefine _METHOD_ref _ %+ BORINGSSL_PREFIX %+ _METHOD_ref %xdefine _METHOD_unref _ %+ BORINGSSL_PREFIX %+ _METHOD_unref +%xdefine _MLDSA65_generate_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_generate_key +%xdefine _MLDSA65_generate_key_external_entropy _ %+ BORINGSSL_PREFIX %+ _MLDSA65_generate_key_external_entropy +%xdefine _MLDSA65_marshal_private_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_marshal_private_key +%xdefine _MLDSA65_marshal_public_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_marshal_public_key +%xdefine _MLDSA65_parse_private_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_parse_private_key +%xdefine _MLDSA65_parse_public_key _ %+ BORINGSSL_PREFIX %+ _MLDSA65_parse_public_key +%xdefine _MLDSA65_private_key_from_seed _ %+ BORINGSSL_PREFIX %+ _MLDSA65_private_key_from_seed +%xdefine _MLDSA65_public_from_private _ %+ BORINGSSL_PREFIX %+ _MLDSA65_public_from_private +%xdefine _MLDSA65_sign _ %+ BORINGSSL_PREFIX %+ _MLDSA65_sign +%xdefine _MLDSA65_sign_internal _ %+ BORINGSSL_PREFIX %+ _MLDSA65_sign_internal +%xdefine _MLDSA65_verify _ %+ BORINGSSL_PREFIX %+ _MLDSA65_verify +%xdefine _MLDSA65_verify_internal _ %+ BORINGSSL_PREFIX %+ _MLDSA65_verify_internal %xdefine _NAME_CONSTRAINTS_check _ %+ BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_check %xdefine _NAME_CONSTRAINTS_free _ %+ BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_free %xdefine _NAME_CONSTRAINTS_it _ %+ BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_it @@ -1362,6 +1396,8 @@ %xdefine _NOTICEREF_free _ %+ BORINGSSL_PREFIX %+ _NOTICEREF_free %xdefine _NOTICEREF_it _ %+ BORINGSSL_PREFIX %+ _NOTICEREF_it %xdefine _NOTICEREF_new _ %+ BORINGSSL_PREFIX %+ _NOTICEREF_new +%xdefine _OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER _ %+ BORINGSSL_PREFIX %+ 
_OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER +%xdefine _OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER _ %+ BORINGSSL_PREFIX %+ _OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER %xdefine _OBJ_cbs2nid _ %+ BORINGSSL_PREFIX %+ _OBJ_cbs2nid %xdefine _OBJ_cleanup _ %+ BORINGSSL_PREFIX %+ _OBJ_cleanup %xdefine _OBJ_cmp _ %+ BORINGSSL_PREFIX %+ _OBJ_cmp @@ -1401,6 +1437,7 @@ %xdefine _OPENSSL_gmtime_diff _ %+ BORINGSSL_PREFIX %+ _OPENSSL_gmtime_diff %xdefine _OPENSSL_hash32 _ %+ BORINGSSL_PREFIX %+ _OPENSSL_hash32 %xdefine _OPENSSL_ia32cap_P _ %+ BORINGSSL_PREFIX %+ _OPENSSL_ia32cap_P +%xdefine _OPENSSL_init_cpuid _ %+ BORINGSSL_PREFIX %+ _OPENSSL_init_cpuid %xdefine _OPENSSL_init_crypto _ %+ BORINGSSL_PREFIX %+ _OPENSSL_init_crypto %xdefine _OPENSSL_isalnum _ %+ BORINGSSL_PREFIX %+ _OPENSSL_isalnum %xdefine _OPENSSL_isalpha _ %+ BORINGSSL_PREFIX %+ _OPENSSL_isalpha @@ -1610,7 +1647,6 @@ %xdefine _RAND_SSLeay _ %+ BORINGSSL_PREFIX %+ _RAND_SSLeay %xdefine _RAND_add _ %+ BORINGSSL_PREFIX %+ _RAND_add %xdefine _RAND_bytes _ %+ BORINGSSL_PREFIX %+ _RAND_bytes -%xdefine _RAND_bytes_with_additional_data _ %+ BORINGSSL_PREFIX %+ _RAND_bytes_with_additional_data %xdefine _RAND_cleanup _ %+ BORINGSSL_PREFIX %+ _RAND_cleanup %xdefine _RAND_disable_fork_unsafe_buffering _ %+ BORINGSSL_PREFIX %+ _RAND_disable_fork_unsafe_buffering %xdefine _RAND_egd _ %+ BORINGSSL_PREFIX %+ _RAND_egd @@ -1734,6 +1770,10 @@ %xdefine _SPAKE2_CTX_new _ %+ BORINGSSL_PREFIX %+ _SPAKE2_CTX_new %xdefine _SPAKE2_generate_msg _ %+ BORINGSSL_PREFIX %+ _SPAKE2_generate_msg %xdefine _SPAKE2_process_msg _ %+ BORINGSSL_PREFIX %+ _SPAKE2_process_msg +%xdefine _SPX_generate_key _ %+ BORINGSSL_PREFIX %+ _SPX_generate_key +%xdefine _SPX_generate_key_from_seed _ %+ BORINGSSL_PREFIX %+ _SPX_generate_key_from_seed +%xdefine _SPX_sign _ %+ BORINGSSL_PREFIX %+ _SPX_sign +%xdefine _SPX_verify _ %+ BORINGSSL_PREFIX %+ _SPX_verify %xdefine _SSLeay _ %+ BORINGSSL_PREFIX %+ _SSLeay %xdefine _SSLeay_version _ %+ BORINGSSL_PREFIX %+ _SSLeay_version %xdefine _TRUST_TOKEN_CLIENT_add_key _ %+ BORINGSSL_PREFIX %+ _TRUST_TOKEN_CLIENT_add_key @@ -2062,11 +2102,9 @@ %xdefine _X509_STORE_load_locations _ %+ BORINGSSL_PREFIX %+ _X509_STORE_load_locations %xdefine _X509_STORE_new _ %+ BORINGSSL_PREFIX %+ _X509_STORE_new %xdefine _X509_STORE_set1_param _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set1_param -%xdefine _X509_STORE_set_check_crl _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_check_crl %xdefine _X509_STORE_set_default_paths _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_default_paths %xdefine _X509_STORE_set_depth _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_depth %xdefine _X509_STORE_set_flags _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_flags -%xdefine _X509_STORE_set_get_crl _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_get_crl %xdefine _X509_STORE_set_purpose _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_purpose %xdefine _X509_STORE_set_trust _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_trust %xdefine _X509_STORE_set_verify_cb _ %+ BORINGSSL_PREFIX %+ _X509_STORE_set_verify_cb @@ -2246,8 +2284,11 @@ %xdefine _aes_hw_decrypt _ %+ BORINGSSL_PREFIX %+ _aes_hw_decrypt %xdefine _aes_hw_ecb_encrypt _ %+ BORINGSSL_PREFIX %+ _aes_hw_ecb_encrypt %xdefine _aes_hw_encrypt _ %+ BORINGSSL_PREFIX %+ _aes_hw_encrypt +%xdefine _aes_hw_encrypt_key_to_decrypt_key _ %+ BORINGSSL_PREFIX %+ _aes_hw_encrypt_key_to_decrypt_key %xdefine _aes_hw_set_decrypt_key _ %+ BORINGSSL_PREFIX %+ _aes_hw_set_decrypt_key %xdefine 
_aes_hw_set_encrypt_key _ %+ BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key +%xdefine _aes_hw_set_encrypt_key_alt _ %+ BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key_alt +%xdefine _aes_hw_set_encrypt_key_base _ %+ BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key_base %xdefine _aes_nohw_cbc_encrypt _ %+ BORINGSSL_PREFIX %+ _aes_nohw_cbc_encrypt %xdefine _aes_nohw_ctr32_encrypt_blocks _ %+ BORINGSSL_PREFIX %+ _aes_nohw_ctr32_encrypt_blocks %xdefine _aes_nohw_decrypt _ %+ BORINGSSL_PREFIX %+ _aes_nohw_decrypt @@ -2319,19 +2360,22 @@ %xdefine _bn_mont_ctx_set_RR_consttime _ %+ BORINGSSL_PREFIX %+ _bn_mont_ctx_set_RR_consttime %xdefine _bn_mont_n0 _ %+ BORINGSSL_PREFIX %+ _bn_mont_n0 %xdefine _bn_mul4x_mont _ %+ BORINGSSL_PREFIX %+ _bn_mul4x_mont +%xdefine _bn_mul4x_mont_gather5 _ %+ BORINGSSL_PREFIX %+ _bn_mul4x_mont_gather5 %xdefine _bn_mul_add_words _ %+ BORINGSSL_PREFIX %+ _bn_mul_add_words %xdefine _bn_mul_comba4 _ %+ BORINGSSL_PREFIX %+ _bn_mul_comba4 %xdefine _bn_mul_comba8 _ %+ BORINGSSL_PREFIX %+ _bn_mul_comba8 %xdefine _bn_mul_consttime _ %+ BORINGSSL_PREFIX %+ _bn_mul_consttime %xdefine _bn_mul_mont _ %+ BORINGSSL_PREFIX %+ _bn_mul_mont -%xdefine _bn_mul_mont_gather5 _ %+ BORINGSSL_PREFIX %+ _bn_mul_mont_gather5 +%xdefine _bn_mul_mont_gather5_nohw _ %+ BORINGSSL_PREFIX %+ _bn_mul_mont_gather5_nohw %xdefine _bn_mul_mont_nohw _ %+ BORINGSSL_PREFIX %+ _bn_mul_mont_nohw %xdefine _bn_mul_small _ %+ BORINGSSL_PREFIX %+ _bn_mul_small %xdefine _bn_mul_words _ %+ BORINGSSL_PREFIX %+ _bn_mul_words %xdefine _bn_mulx4x_mont _ %+ BORINGSSL_PREFIX %+ _bn_mulx4x_mont +%xdefine _bn_mulx4x_mont_gather5 _ %+ BORINGSSL_PREFIX %+ _bn_mulx4x_mont_gather5 %xdefine _bn_odd_number_is_obviously_composite _ %+ BORINGSSL_PREFIX %+ _bn_odd_number_is_obviously_composite %xdefine _bn_one_to_montgomery _ %+ BORINGSSL_PREFIX %+ _bn_one_to_montgomery -%xdefine _bn_power5 _ %+ BORINGSSL_PREFIX %+ _bn_power5 +%xdefine _bn_power5_nohw _ %+ BORINGSSL_PREFIX %+ _bn_power5_nohw +%xdefine _bn_powerx5 _ %+ BORINGSSL_PREFIX %+ _bn_powerx5 %xdefine _bn_rand_range_words _ %+ BORINGSSL_PREFIX %+ _bn_rand_range_words %xdefine _bn_rand_secret_range _ %+ BORINGSSL_PREFIX %+ _bn_rand_secret_range %xdefine _bn_reduce_once _ %+ BORINGSSL_PREFIX %+ _bn_reduce_once @@ -2366,7 +2410,11 @@ %xdefine _c2i_ASN1_INTEGER _ %+ BORINGSSL_PREFIX %+ _c2i_ASN1_INTEGER %xdefine _c2i_ASN1_OBJECT _ %+ BORINGSSL_PREFIX %+ _c2i_ASN1_OBJECT %xdefine _chacha20_poly1305_open _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_open +%xdefine _chacha20_poly1305_open_avx2 _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_open_avx2 +%xdefine _chacha20_poly1305_open_nohw _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_open_nohw %xdefine _chacha20_poly1305_seal _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_seal +%xdefine _chacha20_poly1305_seal_avx2 _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_seal_avx2 +%xdefine _chacha20_poly1305_seal_nohw _ %+ BORINGSSL_PREFIX %+ _chacha20_poly1305_seal_nohw %xdefine _crypto_gcm_clmul_enabled _ %+ BORINGSSL_PREFIX %+ _crypto_gcm_clmul_enabled %xdefine _d2i_ASN1_BIT_STRING _ %+ BORINGSSL_PREFIX %+ _d2i_ASN1_BIT_STRING %xdefine _d2i_ASN1_BMPSTRING _ %+ BORINGSSL_PREFIX %+ _d2i_ASN1_BMPSTRING @@ -2475,8 +2523,10 @@ %xdefine _d2i_X509_VAL _ %+ BORINGSSL_PREFIX %+ _d2i_X509_VAL %xdefine _d2i_X509_bio _ %+ BORINGSSL_PREFIX %+ _d2i_X509_bio %xdefine _d2i_X509_fp _ %+ BORINGSSL_PREFIX %+ _d2i_X509_fp +%xdefine _dh_asn1_meth _ %+ BORINGSSL_PREFIX %+ _dh_asn1_meth %xdefine _dh_check_params_fast _ %+ BORINGSSL_PREFIX %+ _dh_check_params_fast %xdefine 
_dh_compute_key_padded_no_self_test _ %+ BORINGSSL_PREFIX %+ _dh_compute_key_padded_no_self_test +%xdefine _dh_pkey_meth _ %+ BORINGSSL_PREFIX %+ _dh_pkey_meth %xdefine _dsa_asn1_meth _ %+ BORINGSSL_PREFIX %+ _dsa_asn1_meth %xdefine _dsa_check_key _ %+ BORINGSSL_PREFIX %+ _dsa_check_key %xdefine _ec_GFp_mont_add _ %+ BORINGSSL_PREFIX %+ _ec_GFp_mont_add @@ -2566,25 +2616,46 @@ %xdefine _ec_set_to_safe_point _ %+ BORINGSSL_PREFIX %+ _ec_set_to_safe_point %xdefine _ec_simple_scalar_inv0_montgomery _ %+ BORINGSSL_PREFIX %+ _ec_simple_scalar_inv0_montgomery %xdefine _ec_simple_scalar_to_montgomery_inv_vartime _ %+ BORINGSSL_PREFIX %+ _ec_simple_scalar_to_montgomery_inv_vartime -%xdefine _ecdsa_do_verify_no_self_test _ %+ BORINGSSL_PREFIX %+ _ecdsa_do_verify_no_self_test -%xdefine _ecdsa_sign_with_nonce_for_known_answer_test _ %+ BORINGSSL_PREFIX %+ _ecdsa_sign_with_nonce_for_known_answer_test -%xdefine _ecp_nistz256_avx2_select_w7 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_avx2_select_w7 +%xdefine _ecdsa_sign_fixed _ %+ BORINGSSL_PREFIX %+ _ecdsa_sign_fixed +%xdefine _ecdsa_sign_fixed_with_nonce_for_known_answer_test _ %+ BORINGSSL_PREFIX %+ _ecdsa_sign_fixed_with_nonce_for_known_answer_test +%xdefine _ecdsa_verify_fixed _ %+ BORINGSSL_PREFIX %+ _ecdsa_verify_fixed +%xdefine _ecdsa_verify_fixed_no_self_test _ %+ BORINGSSL_PREFIX %+ _ecdsa_verify_fixed_no_self_test %xdefine _ecp_nistz256_div_by_2 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_div_by_2 %xdefine _ecp_nistz256_mul_by_2 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_by_2 %xdefine _ecp_nistz256_mul_by_3 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_by_3 %xdefine _ecp_nistz256_mul_mont _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont +%xdefine _ecp_nistz256_mul_mont_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont_adx +%xdefine _ecp_nistz256_mul_mont_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont_nohw %xdefine _ecp_nistz256_neg _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_neg %xdefine _ecp_nistz256_ord_mul_mont _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont +%xdefine _ecp_nistz256_ord_mul_mont_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont_adx +%xdefine _ecp_nistz256_ord_mul_mont_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont_nohw %xdefine _ecp_nistz256_ord_sqr_mont _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont +%xdefine _ecp_nistz256_ord_sqr_mont_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont_adx +%xdefine _ecp_nistz256_ord_sqr_mont_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont_nohw %xdefine _ecp_nistz256_point_add _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add +%xdefine _ecp_nistz256_point_add_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_adx %xdefine _ecp_nistz256_point_add_affine _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine +%xdefine _ecp_nistz256_point_add_affine_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine_adx +%xdefine _ecp_nistz256_point_add_affine_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine_nohw +%xdefine _ecp_nistz256_point_add_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_nohw %xdefine _ecp_nistz256_point_double _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_double +%xdefine _ecp_nistz256_point_double_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_double_adx +%xdefine _ecp_nistz256_point_double_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_point_double_nohw %xdefine _ecp_nistz256_select_w5 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5 +%xdefine _ecp_nistz256_select_w5_avx2 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5_avx2 
+%xdefine _ecp_nistz256_select_w5_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5_nohw %xdefine _ecp_nistz256_select_w7 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7 +%xdefine _ecp_nistz256_select_w7_avx2 _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7_avx2 +%xdefine _ecp_nistz256_select_w7_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7_nohw %xdefine _ecp_nistz256_sqr_mont _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont +%xdefine _ecp_nistz256_sqr_mont_adx _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont_adx +%xdefine _ecp_nistz256_sqr_mont_nohw _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont_nohw %xdefine _ecp_nistz256_sub _ %+ BORINGSSL_PREFIX %+ _ecp_nistz256_sub %xdefine _ed25519_asn1_meth _ %+ BORINGSSL_PREFIX %+ _ed25519_asn1_meth %xdefine _ed25519_pkey_meth _ %+ BORINGSSL_PREFIX %+ _ed25519_pkey_meth +%xdefine _evp_md_md5_sha1 _ %+ BORINGSSL_PREFIX %+ _evp_md_md5_sha1 +%xdefine _evp_pkey_set_method _ %+ BORINGSSL_PREFIX %+ _evp_pkey_set_method %xdefine _fiat_curve25519_adx_mul _ %+ BORINGSSL_PREFIX %+ _fiat_curve25519_adx_mul %xdefine _fiat_curve25519_adx_square _ %+ BORINGSSL_PREFIX %+ _fiat_curve25519_adx_square %xdefine _fiat_p256_adx_mul _ %+ BORINGSSL_PREFIX %+ _fiat_p256_adx_mul @@ -2826,8 +2897,6 @@ %xdefine _spx_fors_sign _ %+ BORINGSSL_PREFIX %+ _spx_fors_sign %xdefine _spx_fors_sk_gen _ %+ BORINGSSL_PREFIX %+ _spx_fors_sk_gen %xdefine _spx_fors_treehash _ %+ BORINGSSL_PREFIX %+ _spx_fors_treehash -%xdefine _spx_generate_key _ %+ BORINGSSL_PREFIX %+ _spx_generate_key -%xdefine _spx_generate_key_from_seed _ %+ BORINGSSL_PREFIX %+ _spx_generate_key_from_seed %xdefine _spx_get_tree_index _ %+ BORINGSSL_PREFIX %+ _spx_get_tree_index %xdefine _spx_ht_sign _ %+ BORINGSSL_PREFIX %+ _spx_ht_sign %xdefine _spx_ht_verify _ %+ BORINGSSL_PREFIX %+ _spx_ht_verify @@ -2839,7 +2908,6 @@ %xdefine _spx_set_tree_height _ %+ BORINGSSL_PREFIX %+ _spx_set_tree_height %xdefine _spx_set_tree_index _ %+ BORINGSSL_PREFIX %+ _spx_set_tree_index %xdefine _spx_set_type _ %+ BORINGSSL_PREFIX %+ _spx_set_type -%xdefine _spx_sign _ %+ BORINGSSL_PREFIX %+ _spx_sign %xdefine _spx_thash_f _ %+ BORINGSSL_PREFIX %+ _spx_thash_f %xdefine _spx_thash_h _ %+ BORINGSSL_PREFIX %+ _spx_thash_h %xdefine _spx_thash_hmsg _ %+ BORINGSSL_PREFIX %+ _spx_thash_hmsg @@ -2850,12 +2918,12 @@ %xdefine _spx_to_uint64 _ %+ BORINGSSL_PREFIX %+ _spx_to_uint64 %xdefine _spx_treehash _ %+ BORINGSSL_PREFIX %+ _spx_treehash %xdefine _spx_uint64_to_len_bytes _ %+ BORINGSSL_PREFIX %+ _spx_uint64_to_len_bytes -%xdefine _spx_verify _ %+ BORINGSSL_PREFIX %+ _spx_verify %xdefine _spx_wots_pk_from_sig _ %+ BORINGSSL_PREFIX %+ _spx_wots_pk_from_sig %xdefine _spx_wots_pk_gen _ %+ BORINGSSL_PREFIX %+ _spx_wots_pk_gen %xdefine _spx_wots_sign _ %+ BORINGSSL_PREFIX %+ _spx_wots_sign %xdefine _spx_xmss_pk_from_sig _ %+ BORINGSSL_PREFIX %+ _spx_xmss_pk_from_sig %xdefine _spx_xmss_sign _ %+ BORINGSSL_PREFIX %+ _spx_xmss_sign +%xdefine _swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE _ %+ BORINGSSL_PREFIX %+ _swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE %xdefine _v2i_GENERAL_NAME _ %+ BORINGSSL_PREFIX %+ _v2i_GENERAL_NAME %xdefine _v2i_GENERAL_NAMES _ %+ BORINGSSL_PREFIX %+ _v2i_GENERAL_NAMES %xdefine _v2i_GENERAL_NAME_ex _ %+ BORINGSSL_PREFIX %+ _v2i_GENERAL_NAME_ex @@ -3128,6 +3196,14 @@ %xdefine BASIC_CONSTRAINTS_free BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_free %xdefine BASIC_CONSTRAINTS_it BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_it %xdefine BASIC_CONSTRAINTS_new BORINGSSL_PREFIX %+ _BASIC_CONSTRAINTS_new +%xdefine 
BCM_fips_186_2_prf BORINGSSL_PREFIX %+ _BCM_fips_186_2_prf +%xdefine BCM_rand_bytes BORINGSSL_PREFIX %+ _BCM_rand_bytes +%xdefine BCM_rand_bytes_hwrng BORINGSSL_PREFIX %+ _BCM_rand_bytes_hwrng +%xdefine BCM_rand_bytes_with_additional_data BORINGSSL_PREFIX %+ _BCM_rand_bytes_with_additional_data +%xdefine BCM_sha1_final BORINGSSL_PREFIX %+ _BCM_sha1_final +%xdefine BCM_sha1_init BORINGSSL_PREFIX %+ _BCM_sha1_init +%xdefine BCM_sha1_transform BORINGSSL_PREFIX %+ _BCM_sha1_transform +%xdefine BCM_sha1_update BORINGSSL_PREFIX %+ _BCM_sha1_update %xdefine BIO_append_filename BORINGSSL_PREFIX %+ _BIO_append_filename %xdefine BIO_callback_ctrl BORINGSSL_PREFIX %+ _BIO_callback_ctrl %xdefine BIO_clear_flags BORINGSSL_PREFIX %+ _BIO_clear_flags @@ -3634,6 +3710,16 @@ %xdefine DH_size BORINGSSL_PREFIX %+ _DH_size %xdefine DH_up_ref BORINGSSL_PREFIX %+ _DH_up_ref %xdefine DHparams_dup BORINGSSL_PREFIX %+ _DHparams_dup +%xdefine DILITHIUM_generate_key BORINGSSL_PREFIX %+ _DILITHIUM_generate_key +%xdefine DILITHIUM_generate_key_external_entropy BORINGSSL_PREFIX %+ _DILITHIUM_generate_key_external_entropy +%xdefine DILITHIUM_marshal_private_key BORINGSSL_PREFIX %+ _DILITHIUM_marshal_private_key +%xdefine DILITHIUM_marshal_public_key BORINGSSL_PREFIX %+ _DILITHIUM_marshal_public_key +%xdefine DILITHIUM_parse_private_key BORINGSSL_PREFIX %+ _DILITHIUM_parse_private_key +%xdefine DILITHIUM_parse_public_key BORINGSSL_PREFIX %+ _DILITHIUM_parse_public_key +%xdefine DILITHIUM_public_from_private BORINGSSL_PREFIX %+ _DILITHIUM_public_from_private +%xdefine DILITHIUM_sign BORINGSSL_PREFIX %+ _DILITHIUM_sign +%xdefine DILITHIUM_sign_deterministic BORINGSSL_PREFIX %+ _DILITHIUM_sign_deterministic +%xdefine DILITHIUM_verify BORINGSSL_PREFIX %+ _DILITHIUM_verify %xdefine DIRECTORYSTRING_free BORINGSSL_PREFIX %+ _DIRECTORYSTRING_free %xdefine DIRECTORYSTRING_it BORINGSSL_PREFIX %+ _DIRECTORYSTRING_it %xdefine DIRECTORYSTRING_new BORINGSSL_PREFIX %+ _DIRECTORYSTRING_new @@ -4015,6 +4101,7 @@ %xdefine EVP_PKEY_CTX_set0_rsa_oaep_label BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set0_rsa_oaep_label %xdefine EVP_PKEY_CTX_set1_hkdf_key BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set1_hkdf_key %xdefine EVP_PKEY_CTX_set1_hkdf_salt BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set1_hkdf_salt +%xdefine EVP_PKEY_CTX_set_dh_pad BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dh_pad %xdefine EVP_PKEY_CTX_set_dsa_paramgen_bits BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dsa_paramgen_bits %xdefine EVP_PKEY_CTX_set_dsa_paramgen_q_bits BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_dsa_paramgen_q_bits %xdefine EVP_PKEY_CTX_set_ec_param_enc BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_ec_param_enc @@ -4031,6 +4118,7 @@ %xdefine EVP_PKEY_CTX_set_rsa_pss_saltlen BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_rsa_pss_saltlen %xdefine EVP_PKEY_CTX_set_signature_md BORINGSSL_PREFIX %+ _EVP_PKEY_CTX_set_signature_md %xdefine EVP_PKEY_assign BORINGSSL_PREFIX %+ _EVP_PKEY_assign +%xdefine EVP_PKEY_assign_DH BORINGSSL_PREFIX %+ _EVP_PKEY_assign_DH %xdefine EVP_PKEY_assign_DSA BORINGSSL_PREFIX %+ _EVP_PKEY_assign_DSA %xdefine EVP_PKEY_assign_EC_KEY BORINGSSL_PREFIX %+ _EVP_PKEY_assign_EC_KEY %xdefine EVP_PKEY_assign_RSA BORINGSSL_PREFIX %+ _EVP_PKEY_assign_RSA @@ -4072,6 +4160,7 @@ %xdefine EVP_PKEY_print_params BORINGSSL_PREFIX %+ _EVP_PKEY_print_params %xdefine EVP_PKEY_print_private BORINGSSL_PREFIX %+ _EVP_PKEY_print_private %xdefine EVP_PKEY_print_public BORINGSSL_PREFIX %+ _EVP_PKEY_print_public +%xdefine EVP_PKEY_set1_DH BORINGSSL_PREFIX %+ _EVP_PKEY_set1_DH %xdefine EVP_PKEY_set1_DSA BORINGSSL_PREFIX %+ 
_EVP_PKEY_set1_DSA %xdefine EVP_PKEY_set1_EC_KEY BORINGSSL_PREFIX %+ _EVP_PKEY_set1_EC_KEY %xdefine EVP_PKEY_set1_RSA BORINGSSL_PREFIX %+ _EVP_PKEY_set1_RSA @@ -4156,6 +4245,7 @@ %xdefine EVP_hpke_aes_256_gcm BORINGSSL_PREFIX %+ _EVP_hpke_aes_256_gcm %xdefine EVP_hpke_chacha20_poly1305 BORINGSSL_PREFIX %+ _EVP_hpke_chacha20_poly1305 %xdefine EVP_hpke_hkdf_sha256 BORINGSSL_PREFIX %+ _EVP_hpke_hkdf_sha256 +%xdefine EVP_hpke_p256_hkdf_sha256 BORINGSSL_PREFIX %+ _EVP_hpke_p256_hkdf_sha256 %xdefine EVP_hpke_x25519_hkdf_sha256 BORINGSSL_PREFIX %+ _EVP_hpke_x25519_hkdf_sha256 %xdefine EVP_marshal_digest_algorithm BORINGSSL_PREFIX %+ _EVP_marshal_digest_algorithm %xdefine EVP_marshal_private_key BORINGSSL_PREFIX %+ _EVP_marshal_private_key @@ -4258,6 +4348,18 @@ %xdefine MD5_Update BORINGSSL_PREFIX %+ _MD5_Update %xdefine METHOD_ref BORINGSSL_PREFIX %+ _METHOD_ref %xdefine METHOD_unref BORINGSSL_PREFIX %+ _METHOD_unref +%xdefine MLDSA65_generate_key BORINGSSL_PREFIX %+ _MLDSA65_generate_key +%xdefine MLDSA65_generate_key_external_entropy BORINGSSL_PREFIX %+ _MLDSA65_generate_key_external_entropy +%xdefine MLDSA65_marshal_private_key BORINGSSL_PREFIX %+ _MLDSA65_marshal_private_key +%xdefine MLDSA65_marshal_public_key BORINGSSL_PREFIX %+ _MLDSA65_marshal_public_key +%xdefine MLDSA65_parse_private_key BORINGSSL_PREFIX %+ _MLDSA65_parse_private_key +%xdefine MLDSA65_parse_public_key BORINGSSL_PREFIX %+ _MLDSA65_parse_public_key +%xdefine MLDSA65_private_key_from_seed BORINGSSL_PREFIX %+ _MLDSA65_private_key_from_seed +%xdefine MLDSA65_public_from_private BORINGSSL_PREFIX %+ _MLDSA65_public_from_private +%xdefine MLDSA65_sign BORINGSSL_PREFIX %+ _MLDSA65_sign +%xdefine MLDSA65_sign_internal BORINGSSL_PREFIX %+ _MLDSA65_sign_internal +%xdefine MLDSA65_verify BORINGSSL_PREFIX %+ _MLDSA65_verify +%xdefine MLDSA65_verify_internal BORINGSSL_PREFIX %+ _MLDSA65_verify_internal %xdefine NAME_CONSTRAINTS_check BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_check %xdefine NAME_CONSTRAINTS_free BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_free %xdefine NAME_CONSTRAINTS_it BORINGSSL_PREFIX %+ _NAME_CONSTRAINTS_it @@ -4283,6 +4385,8 @@ %xdefine NOTICEREF_free BORINGSSL_PREFIX %+ _NOTICEREF_free %xdefine NOTICEREF_it BORINGSSL_PREFIX %+ _NOTICEREF_it %xdefine NOTICEREF_new BORINGSSL_PREFIX %+ _NOTICEREF_new +%xdefine OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_PREFIX %+ _OBJC_CLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER +%xdefine OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER BORINGSSL_PREFIX %+ _OBJC_METACLASS_$_swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLER_FINDER %xdefine OBJ_cbs2nid BORINGSSL_PREFIX %+ _OBJ_cbs2nid %xdefine OBJ_cleanup BORINGSSL_PREFIX %+ _OBJ_cleanup %xdefine OBJ_cmp BORINGSSL_PREFIX %+ _OBJ_cmp @@ -4322,6 +4426,7 @@ %xdefine OPENSSL_gmtime_diff BORINGSSL_PREFIX %+ _OPENSSL_gmtime_diff %xdefine OPENSSL_hash32 BORINGSSL_PREFIX %+ _OPENSSL_hash32 %xdefine OPENSSL_ia32cap_P BORINGSSL_PREFIX %+ _OPENSSL_ia32cap_P +%xdefine OPENSSL_init_cpuid BORINGSSL_PREFIX %+ _OPENSSL_init_cpuid %xdefine OPENSSL_init_crypto BORINGSSL_PREFIX %+ _OPENSSL_init_crypto %xdefine OPENSSL_isalnum BORINGSSL_PREFIX %+ _OPENSSL_isalnum %xdefine OPENSSL_isalpha BORINGSSL_PREFIX %+ _OPENSSL_isalpha @@ -4531,7 +4636,6 @@ %xdefine RAND_SSLeay BORINGSSL_PREFIX %+ _RAND_SSLeay %xdefine RAND_add BORINGSSL_PREFIX %+ _RAND_add %xdefine RAND_bytes BORINGSSL_PREFIX %+ _RAND_bytes -%xdefine RAND_bytes_with_additional_data BORINGSSL_PREFIX %+ 
_RAND_bytes_with_additional_data %xdefine RAND_cleanup BORINGSSL_PREFIX %+ _RAND_cleanup %xdefine RAND_disable_fork_unsafe_buffering BORINGSSL_PREFIX %+ _RAND_disable_fork_unsafe_buffering %xdefine RAND_egd BORINGSSL_PREFIX %+ _RAND_egd @@ -4655,6 +4759,10 @@ %xdefine SPAKE2_CTX_new BORINGSSL_PREFIX %+ _SPAKE2_CTX_new %xdefine SPAKE2_generate_msg BORINGSSL_PREFIX %+ _SPAKE2_generate_msg %xdefine SPAKE2_process_msg BORINGSSL_PREFIX %+ _SPAKE2_process_msg +%xdefine SPX_generate_key BORINGSSL_PREFIX %+ _SPX_generate_key +%xdefine SPX_generate_key_from_seed BORINGSSL_PREFIX %+ _SPX_generate_key_from_seed +%xdefine SPX_sign BORINGSSL_PREFIX %+ _SPX_sign +%xdefine SPX_verify BORINGSSL_PREFIX %+ _SPX_verify %xdefine SSLeay BORINGSSL_PREFIX %+ _SSLeay %xdefine SSLeay_version BORINGSSL_PREFIX %+ _SSLeay_version %xdefine TRUST_TOKEN_CLIENT_add_key BORINGSSL_PREFIX %+ _TRUST_TOKEN_CLIENT_add_key @@ -4983,11 +5091,9 @@ %xdefine X509_STORE_load_locations BORINGSSL_PREFIX %+ _X509_STORE_load_locations %xdefine X509_STORE_new BORINGSSL_PREFIX %+ _X509_STORE_new %xdefine X509_STORE_set1_param BORINGSSL_PREFIX %+ _X509_STORE_set1_param -%xdefine X509_STORE_set_check_crl BORINGSSL_PREFIX %+ _X509_STORE_set_check_crl %xdefine X509_STORE_set_default_paths BORINGSSL_PREFIX %+ _X509_STORE_set_default_paths %xdefine X509_STORE_set_depth BORINGSSL_PREFIX %+ _X509_STORE_set_depth %xdefine X509_STORE_set_flags BORINGSSL_PREFIX %+ _X509_STORE_set_flags -%xdefine X509_STORE_set_get_crl BORINGSSL_PREFIX %+ _X509_STORE_set_get_crl %xdefine X509_STORE_set_purpose BORINGSSL_PREFIX %+ _X509_STORE_set_purpose %xdefine X509_STORE_set_trust BORINGSSL_PREFIX %+ _X509_STORE_set_trust %xdefine X509_STORE_set_verify_cb BORINGSSL_PREFIX %+ _X509_STORE_set_verify_cb @@ -5167,8 +5273,11 @@ %xdefine aes_hw_decrypt BORINGSSL_PREFIX %+ _aes_hw_decrypt %xdefine aes_hw_ecb_encrypt BORINGSSL_PREFIX %+ _aes_hw_ecb_encrypt %xdefine aes_hw_encrypt BORINGSSL_PREFIX %+ _aes_hw_encrypt +%xdefine aes_hw_encrypt_key_to_decrypt_key BORINGSSL_PREFIX %+ _aes_hw_encrypt_key_to_decrypt_key %xdefine aes_hw_set_decrypt_key BORINGSSL_PREFIX %+ _aes_hw_set_decrypt_key %xdefine aes_hw_set_encrypt_key BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key +%xdefine aes_hw_set_encrypt_key_alt BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key_alt +%xdefine aes_hw_set_encrypt_key_base BORINGSSL_PREFIX %+ _aes_hw_set_encrypt_key_base %xdefine aes_nohw_cbc_encrypt BORINGSSL_PREFIX %+ _aes_nohw_cbc_encrypt %xdefine aes_nohw_ctr32_encrypt_blocks BORINGSSL_PREFIX %+ _aes_nohw_ctr32_encrypt_blocks %xdefine aes_nohw_decrypt BORINGSSL_PREFIX %+ _aes_nohw_decrypt @@ -5240,19 +5349,22 @@ %xdefine bn_mont_ctx_set_RR_consttime BORINGSSL_PREFIX %+ _bn_mont_ctx_set_RR_consttime %xdefine bn_mont_n0 BORINGSSL_PREFIX %+ _bn_mont_n0 %xdefine bn_mul4x_mont BORINGSSL_PREFIX %+ _bn_mul4x_mont +%xdefine bn_mul4x_mont_gather5 BORINGSSL_PREFIX %+ _bn_mul4x_mont_gather5 %xdefine bn_mul_add_words BORINGSSL_PREFIX %+ _bn_mul_add_words %xdefine bn_mul_comba4 BORINGSSL_PREFIX %+ _bn_mul_comba4 %xdefine bn_mul_comba8 BORINGSSL_PREFIX %+ _bn_mul_comba8 %xdefine bn_mul_consttime BORINGSSL_PREFIX %+ _bn_mul_consttime %xdefine bn_mul_mont BORINGSSL_PREFIX %+ _bn_mul_mont -%xdefine bn_mul_mont_gather5 BORINGSSL_PREFIX %+ _bn_mul_mont_gather5 +%xdefine bn_mul_mont_gather5_nohw BORINGSSL_PREFIX %+ _bn_mul_mont_gather5_nohw %xdefine bn_mul_mont_nohw BORINGSSL_PREFIX %+ _bn_mul_mont_nohw %xdefine bn_mul_small BORINGSSL_PREFIX %+ _bn_mul_small %xdefine bn_mul_words BORINGSSL_PREFIX %+ _bn_mul_words %xdefine 
bn_mulx4x_mont BORINGSSL_PREFIX %+ _bn_mulx4x_mont +%xdefine bn_mulx4x_mont_gather5 BORINGSSL_PREFIX %+ _bn_mulx4x_mont_gather5 %xdefine bn_odd_number_is_obviously_composite BORINGSSL_PREFIX %+ _bn_odd_number_is_obviously_composite %xdefine bn_one_to_montgomery BORINGSSL_PREFIX %+ _bn_one_to_montgomery -%xdefine bn_power5 BORINGSSL_PREFIX %+ _bn_power5 +%xdefine bn_power5_nohw BORINGSSL_PREFIX %+ _bn_power5_nohw +%xdefine bn_powerx5 BORINGSSL_PREFIX %+ _bn_powerx5 %xdefine bn_rand_range_words BORINGSSL_PREFIX %+ _bn_rand_range_words %xdefine bn_rand_secret_range BORINGSSL_PREFIX %+ _bn_rand_secret_range %xdefine bn_reduce_once BORINGSSL_PREFIX %+ _bn_reduce_once @@ -5287,7 +5399,11 @@ %xdefine c2i_ASN1_INTEGER BORINGSSL_PREFIX %+ _c2i_ASN1_INTEGER %xdefine c2i_ASN1_OBJECT BORINGSSL_PREFIX %+ _c2i_ASN1_OBJECT %xdefine chacha20_poly1305_open BORINGSSL_PREFIX %+ _chacha20_poly1305_open +%xdefine chacha20_poly1305_open_avx2 BORINGSSL_PREFIX %+ _chacha20_poly1305_open_avx2 +%xdefine chacha20_poly1305_open_nohw BORINGSSL_PREFIX %+ _chacha20_poly1305_open_nohw %xdefine chacha20_poly1305_seal BORINGSSL_PREFIX %+ _chacha20_poly1305_seal +%xdefine chacha20_poly1305_seal_avx2 BORINGSSL_PREFIX %+ _chacha20_poly1305_seal_avx2 +%xdefine chacha20_poly1305_seal_nohw BORINGSSL_PREFIX %+ _chacha20_poly1305_seal_nohw %xdefine crypto_gcm_clmul_enabled BORINGSSL_PREFIX %+ _crypto_gcm_clmul_enabled %xdefine d2i_ASN1_BIT_STRING BORINGSSL_PREFIX %+ _d2i_ASN1_BIT_STRING %xdefine d2i_ASN1_BMPSTRING BORINGSSL_PREFIX %+ _d2i_ASN1_BMPSTRING @@ -5396,8 +5512,10 @@ %xdefine d2i_X509_VAL BORINGSSL_PREFIX %+ _d2i_X509_VAL %xdefine d2i_X509_bio BORINGSSL_PREFIX %+ _d2i_X509_bio %xdefine d2i_X509_fp BORINGSSL_PREFIX %+ _d2i_X509_fp +%xdefine dh_asn1_meth BORINGSSL_PREFIX %+ _dh_asn1_meth %xdefine dh_check_params_fast BORINGSSL_PREFIX %+ _dh_check_params_fast %xdefine dh_compute_key_padded_no_self_test BORINGSSL_PREFIX %+ _dh_compute_key_padded_no_self_test +%xdefine dh_pkey_meth BORINGSSL_PREFIX %+ _dh_pkey_meth %xdefine dsa_asn1_meth BORINGSSL_PREFIX %+ _dsa_asn1_meth %xdefine dsa_check_key BORINGSSL_PREFIX %+ _dsa_check_key %xdefine ec_GFp_mont_add BORINGSSL_PREFIX %+ _ec_GFp_mont_add @@ -5487,25 +5605,46 @@ %xdefine ec_set_to_safe_point BORINGSSL_PREFIX %+ _ec_set_to_safe_point %xdefine ec_simple_scalar_inv0_montgomery BORINGSSL_PREFIX %+ _ec_simple_scalar_inv0_montgomery %xdefine ec_simple_scalar_to_montgomery_inv_vartime BORINGSSL_PREFIX %+ _ec_simple_scalar_to_montgomery_inv_vartime -%xdefine ecdsa_do_verify_no_self_test BORINGSSL_PREFIX %+ _ecdsa_do_verify_no_self_test -%xdefine ecdsa_sign_with_nonce_for_known_answer_test BORINGSSL_PREFIX %+ _ecdsa_sign_with_nonce_for_known_answer_test -%xdefine ecp_nistz256_avx2_select_w7 BORINGSSL_PREFIX %+ _ecp_nistz256_avx2_select_w7 +%xdefine ecdsa_sign_fixed BORINGSSL_PREFIX %+ _ecdsa_sign_fixed +%xdefine ecdsa_sign_fixed_with_nonce_for_known_answer_test BORINGSSL_PREFIX %+ _ecdsa_sign_fixed_with_nonce_for_known_answer_test +%xdefine ecdsa_verify_fixed BORINGSSL_PREFIX %+ _ecdsa_verify_fixed +%xdefine ecdsa_verify_fixed_no_self_test BORINGSSL_PREFIX %+ _ecdsa_verify_fixed_no_self_test %xdefine ecp_nistz256_div_by_2 BORINGSSL_PREFIX %+ _ecp_nistz256_div_by_2 %xdefine ecp_nistz256_mul_by_2 BORINGSSL_PREFIX %+ _ecp_nistz256_mul_by_2 %xdefine ecp_nistz256_mul_by_3 BORINGSSL_PREFIX %+ _ecp_nistz256_mul_by_3 %xdefine ecp_nistz256_mul_mont BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont +%xdefine ecp_nistz256_mul_mont_adx BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont_adx +%xdefine 
ecp_nistz256_mul_mont_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_mul_mont_nohw %xdefine ecp_nistz256_neg BORINGSSL_PREFIX %+ _ecp_nistz256_neg %xdefine ecp_nistz256_ord_mul_mont BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont +%xdefine ecp_nistz256_ord_mul_mont_adx BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont_adx +%xdefine ecp_nistz256_ord_mul_mont_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_ord_mul_mont_nohw %xdefine ecp_nistz256_ord_sqr_mont BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont +%xdefine ecp_nistz256_ord_sqr_mont_adx BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont_adx +%xdefine ecp_nistz256_ord_sqr_mont_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_ord_sqr_mont_nohw %xdefine ecp_nistz256_point_add BORINGSSL_PREFIX %+ _ecp_nistz256_point_add +%xdefine ecp_nistz256_point_add_adx BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_adx %xdefine ecp_nistz256_point_add_affine BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine +%xdefine ecp_nistz256_point_add_affine_adx BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine_adx +%xdefine ecp_nistz256_point_add_affine_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_affine_nohw +%xdefine ecp_nistz256_point_add_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_point_add_nohw %xdefine ecp_nistz256_point_double BORINGSSL_PREFIX %+ _ecp_nistz256_point_double +%xdefine ecp_nistz256_point_double_adx BORINGSSL_PREFIX %+ _ecp_nistz256_point_double_adx +%xdefine ecp_nistz256_point_double_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_point_double_nohw %xdefine ecp_nistz256_select_w5 BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5 +%xdefine ecp_nistz256_select_w5_avx2 BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5_avx2 +%xdefine ecp_nistz256_select_w5_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_select_w5_nohw %xdefine ecp_nistz256_select_w7 BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7 +%xdefine ecp_nistz256_select_w7_avx2 BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7_avx2 +%xdefine ecp_nistz256_select_w7_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_select_w7_nohw %xdefine ecp_nistz256_sqr_mont BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont +%xdefine ecp_nistz256_sqr_mont_adx BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont_adx +%xdefine ecp_nistz256_sqr_mont_nohw BORINGSSL_PREFIX %+ _ecp_nistz256_sqr_mont_nohw %xdefine ecp_nistz256_sub BORINGSSL_PREFIX %+ _ecp_nistz256_sub %xdefine ed25519_asn1_meth BORINGSSL_PREFIX %+ _ed25519_asn1_meth %xdefine ed25519_pkey_meth BORINGSSL_PREFIX %+ _ed25519_pkey_meth +%xdefine evp_md_md5_sha1 BORINGSSL_PREFIX %+ _evp_md_md5_sha1 +%xdefine evp_pkey_set_method BORINGSSL_PREFIX %+ _evp_pkey_set_method %xdefine fiat_curve25519_adx_mul BORINGSSL_PREFIX %+ _fiat_curve25519_adx_mul %xdefine fiat_curve25519_adx_square BORINGSSL_PREFIX %+ _fiat_curve25519_adx_square %xdefine fiat_p256_adx_mul BORINGSSL_PREFIX %+ _fiat_p256_adx_mul @@ -5747,8 +5886,6 @@ %xdefine spx_fors_sign BORINGSSL_PREFIX %+ _spx_fors_sign %xdefine spx_fors_sk_gen BORINGSSL_PREFIX %+ _spx_fors_sk_gen %xdefine spx_fors_treehash BORINGSSL_PREFIX %+ _spx_fors_treehash -%xdefine spx_generate_key BORINGSSL_PREFIX %+ _spx_generate_key -%xdefine spx_generate_key_from_seed BORINGSSL_PREFIX %+ _spx_generate_key_from_seed %xdefine spx_get_tree_index BORINGSSL_PREFIX %+ _spx_get_tree_index %xdefine spx_ht_sign BORINGSSL_PREFIX %+ _spx_ht_sign %xdefine spx_ht_verify BORINGSSL_PREFIX %+ _spx_ht_verify @@ -5760,7 +5897,6 @@ %xdefine spx_set_tree_height BORINGSSL_PREFIX %+ _spx_set_tree_height %xdefine spx_set_tree_index BORINGSSL_PREFIX %+ _spx_set_tree_index %xdefine spx_set_type BORINGSSL_PREFIX %+ _spx_set_type -%xdefine spx_sign 
BORINGSSL_PREFIX %+ _spx_sign %xdefine spx_thash_f BORINGSSL_PREFIX %+ _spx_thash_f %xdefine spx_thash_h BORINGSSL_PREFIX %+ _spx_thash_h %xdefine spx_thash_hmsg BORINGSSL_PREFIX %+ _spx_thash_hmsg @@ -5771,12 +5907,12 @@ %xdefine spx_to_uint64 BORINGSSL_PREFIX %+ _spx_to_uint64 %xdefine spx_treehash BORINGSSL_PREFIX %+ _spx_treehash %xdefine spx_uint64_to_len_bytes BORINGSSL_PREFIX %+ _spx_uint64_to_len_bytes -%xdefine spx_verify BORINGSSL_PREFIX %+ _spx_verify %xdefine spx_wots_pk_from_sig BORINGSSL_PREFIX %+ _spx_wots_pk_from_sig %xdefine spx_wots_pk_gen BORINGSSL_PREFIX %+ _spx_wots_pk_gen %xdefine spx_wots_sign BORINGSSL_PREFIX %+ _spx_wots_sign %xdefine spx_xmss_pk_from_sig BORINGSSL_PREFIX %+ _spx_xmss_pk_from_sig %xdefine spx_xmss_sign BORINGSSL_PREFIX %+ _spx_xmss_sign +%xdefine swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE BORINGSSL_PREFIX %+ _swift_crypto_CCryptoBoringSSL_SWIFTPM_MODULE_BUNDLE %xdefine v2i_GENERAL_NAME BORINGSSL_PREFIX %+ _v2i_GENERAL_NAME %xdefine v2i_GENERAL_NAMES BORINGSSL_PREFIX %+ _v2i_GENERAL_NAMES %xdefine v2i_GENERAL_NAME_ex BORINGSSL_PREFIX %+ _v2i_GENERAL_NAME_ex diff --git a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_dilithium.h b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_dilithium.h new file mode 100644 index 00000000..5abd7387 --- /dev/null +++ b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_dilithium.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2023, Google LLC + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_DILITHIUM_H +#define OPENSSL_HEADER_DILITHIUM_H + +#include "CCryptoBoringSSL_base.h" + +#if defined(__cplusplus) +extern "C" { +#endif + + +#if defined(OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM) +// The ML-DSA spec has now been standardized and ML-DSA is available in +// BoringSSL. This code should no longer be used. It was intended for +// short-lived experiments and must not have been deployed anywhere durable. If +// you were using this you need to use the instead. This +// header and code will be removed from BoringSSL soon. + +// Dilithium3. + +// DILITHIUM_private_key contains a Dilithium3 private key. The contents of this +// object should never leave the address space since the format is unstable. +struct DILITHIUM_private_key { + union { + uint8_t bytes[32 + 32 + 64 + 256 * 4 * (5 + 6 + 6)]; + uint32_t alignment; + } opaque; +}; + +// DILITHIUM_public_key contains a Dilithium3 public key. The contents of this +// object should never leave the address space since the format is unstable. +struct DILITHIUM_public_key { + union { + uint8_t bytes[32 + 64 + 256 * 4 * 6]; + uint32_t alignment; + } opaque; +}; + +// DILITHIUM_PRIVATE_KEY_BYTES is the number of bytes in an encoded Dilithium3 +// private key. 
+#define DILITHIUM_PRIVATE_KEY_BYTES 4032 + +// DILITHIUM_PUBLIC_KEY_BYTES is the number of bytes in an encoded Dilithium3 +// public key. +#define DILITHIUM_PUBLIC_KEY_BYTES 1952 + +// DILITHIUM_SIGNATURE_BYTES is the number of bytes in an encoded Dilithium3 +// signature. +#define DILITHIUM_SIGNATURE_BYTES 3309 + +// DILITHIUM_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key| and sets |out_private_key| to +// the private key. Returns 1 on success and 0 on failure. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_generate_key( + uint8_t out_encoded_public_key[DILITHIUM_PUBLIC_KEY_BYTES], + struct DILITHIUM_private_key *out_private_key); + +// DILITHIUM_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. Returns 1 on success and 0 on failure. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_public_from_private( + struct DILITHIUM_public_key *out_public_key, + const struct DILITHIUM_private_key *private_key); + +// DILITHIUM_sign generates a signature for the message |msg| of length +// |msg_len| using |private_key| following the randomized algorithm, and writes +// the encoded signature to |out_encoded_signature|. Returns 1 on success and 0 +// on failure. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_sign( + uint8_t out_encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const struct DILITHIUM_private_key *private_key, const uint8_t *msg, + size_t msg_len); + +// DILITHIUM_verify verifies that |encoded_signature| constitutes a valid +// signature for the message |msg| of length |msg_len| using |public_key|. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_verify( + const struct DILITHIUM_public_key *public_key, + const uint8_t encoded_signature[DILITHIUM_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len); + + +// Serialisation of keys. + +// DILITHIUM_marshal_public_key serializes |public_key| to |out| in the standard +// format for Dilithium public keys. It returns one on success or zero on +// allocation error. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_marshal_public_key( + CBB *out, const struct DILITHIUM_public_key *public_key); + +// DILITHIUM_parse_public_key parses a public key, in the format generated by +// |DILITHIUM_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_parse_public_key( + struct DILITHIUM_public_key *public_key, CBS *in); + +// DILITHIUM_marshal_private_key serializes |private_key| to |out| in the +// standard format for Dilithium private keys. It returns one on success or zero +// on allocation error. +OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_marshal_private_key( + CBB *out, const struct DILITHIUM_private_key *private_key); + +// DILITHIUM_parse_private_key parses a private key, in the format generated by +// |DILITHIUM_marshal_private_key|, from |in| and writes the result to +// |out_private_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. 
+OPENSSL_EXPORT OPENSSL_DEPRECATED int DILITHIUM_parse_private_key( + struct DILITHIUM_private_key *private_key, CBS *in); + +#endif // OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_DILITHIUM_H diff --git a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_kyber.h b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_kyber.h index aba5f4b5..b9c969f9 100644 --- a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_kyber.h +++ b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_kyber.h @@ -22,6 +22,14 @@ extern "C" { #endif +#if defined(OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER) +// This header implements experimental, draft versions of not-yet-standardized +// primitives. When the standard is complete, these functions will be removed +// and replaced with the final, incompatible standard version. They are +// available now for short-lived experiments, but must not be deployed anywhere +// durable, such as a long-lived key store. To use these functions define +// OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER + // Kyber768. // // This implements the round-3 specification of Kyber, defined at @@ -128,6 +136,8 @@ OPENSSL_EXPORT int KYBER_marshal_private_key( OPENSSL_EXPORT int KYBER_parse_private_key( struct KYBER_private_key *out_private_key, CBS *in); +#endif // OPENSSL_UNSTABLE_EXPERIMENTAL_KYBER + #if defined(__cplusplus) } // extern C diff --git a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_spx.h b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_spx.h index 4ba38833..798831c2 100644 --- a/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_spx.h +++ b/Sources/CCryptoBoringSSL/include/experimental/CCryptoBoringSSL_spx.h @@ -22,6 +22,14 @@ extern "C" { #endif +#if defined(OPENSSL_UNSTABLE_EXPERIMENTAL_SPX) +// This header implements experimental, draft versions of not-yet-standardized +// primitives. When the standard is complete, these functions will be removed +// and replaced with the final, incompatible standard version. They are +// available now for short-lived experiments, but must not be deployed anywhere +// durable, such as a long-lived key store. To use these functions define +// OPENSSL_UNSTABLE_EXPERIMENTAL_SPX + // SPX_N is the number of bytes in the hash output #define SPX_N 16 @@ -37,39 +45,42 @@ extern "C" { // SPHINCS+-SHA2-128s #define SPX_SIGNATURE_BYTES 7856 -// spx_generate_key generates a SPHINCS+-SHA2-128s key pair and writes the +// SPX_generate_key generates a SPHINCS+-SHA2-128s key pair and writes the // result to |out_public_key| and |out_secret_key|. // Private key: SK.seed || SK.prf || PK.seed || PK.root // Public key: PK.seed || PK.root -OPENSSL_EXPORT void spx_generate_key( +OPENSSL_EXPORT void SPX_generate_key( uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], uint8_t out_secret_key[SPX_SECRET_KEY_BYTES]); -// spx_generate_key_from_seed generates a SPHINCS+-SHA2-128s key pair from a +// SPX_generate_key_from_seed generates a SPHINCS+-SHA2-128s key pair from a // 48-byte seed and writes the result to |out_public_key| and |out_secret_key|. 
// Secret key: SK.seed || SK.prf || PK.seed || PK.root // Public key: PK.seed || PK.root -OPENSSL_EXPORT void spx_generate_key_from_seed( +OPENSSL_EXPORT void SPX_generate_key_from_seed( uint8_t out_public_key[SPX_PUBLIC_KEY_BYTES], uint8_t out_secret_key[SPX_SECRET_KEY_BYTES], const uint8_t seed[3 * SPX_N]); -// spx_sign generates a SPHINCS+-SHA2-128s signature over |msg| or length +// SPX_sign generates a SPHINCS+-SHA2-128s signature over |msg| or length // |msg_len| using |secret_key| and writes the output to |out_signature|. // // if |randomized| is 0, deterministic signing is performed, otherwise, // non-deterministic signing is performed. -OPENSSL_EXPORT void spx_sign(uint8_t out_snignature[SPX_SIGNATURE_BYTES], - const uint8_t secret_key[SPX_SECRET_KEY_BYTES], - const uint8_t *msg, size_t msg_len, - int randomized); +OPENSSL_EXPORT void SPX_sign( + uint8_t out_snignature[SPX_SIGNATURE_BYTES], + const uint8_t secret_key[SPX_SECRET_KEY_BYTES], const uint8_t *msg, + size_t msg_len, int randomized); -// spx_verify verifies a SPHINCS+-SHA2-128s signature in |signature| over |msg| +// SPX_verify verifies a SPHINCS+-SHA2-128s signature in |signature| over |msg| // or length |msg_len| using |public_key|. 1 is returned if the signature // matches, 0 otherwise. -OPENSSL_EXPORT int spx_verify(const uint8_t signature[SPX_SIGNATURE_BYTES], - const uint8_t public_key[SPX_SECRET_KEY_BYTES], - const uint8_t *msg, size_t msg_len); +OPENSSL_EXPORT int SPX_verify( + const uint8_t signature[SPX_SIGNATURE_BYTES], + const uint8_t public_key[SPX_SECRET_KEY_BYTES], const uint8_t *msg, + size_t msg_len); + +#endif //OPENSSL_UNSTABLE_EXPERIMENTAL_SPX #if defined(__cplusplus) diff --git a/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_mul.S b/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_mul.S index 4fa322fb..a8b18dfb 100644 --- a/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_mul.S +++ b/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_mul.S @@ -4,7 +4,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ (defined(__APPLE__) || defined(__ELF__)) -.intel_syntax noprefix .text #if defined(__APPLE__) .private_extern _fiat_p256_adx_mul @@ -19,158 +18,158 @@ fiat_p256_adx_mul: .cfi_startproc _CET_ENDBR -push rbp -.cfi_adjust_cfa_offset 8 +pushq %rbp +;.cfi_adjust_cfa_offset 8 .cfi_offset rbp, -16 -mov rbp, rsp -mov rax, rdx -mov rdx, [ rsi + 0x0 ] -test al, al -mulx r8, rcx, [ rax + 0x0 ] -mov [ rsp - 0x80 ], rbx +movq %rsp, %rbp +movq %rdx, %rax +movq (%rsi), %rdx +testb %al, %al +mulxq (%rax), %rcx, %r8 +movq %rbx, -0x80(%rsp) .cfi_offset rbx, -16-0x80 -mulx rbx, r9, [ rax + 0x8 ] -mov [ rsp - 0x68 ], r14 +mulxq 0x8(%rax), %r9, %rbx +movq %r14, -0x68(%rsp) .cfi_offset r14, -16-0x68 -adc r9, r8 -mov [ rsp - 0x60 ], r15 +adcq %r8, %r9 +movq %r15, -0x60(%rsp) .cfi_offset r15, -16-0x60 -mulx r15, r14, [ rax + 0x10 ] -mov [ rsp - 0x78 ], r12 +mulxq 0x10(%rax), %r14, %r15 +movq %r12, -0x78(%rsp) .cfi_offset r12, -16-0x78 -adc r14, rbx -mulx r11, r10, [ rax + 0x18 ] -mov [ rsp - 0x70 ], r13 +adcq %rbx, %r14 +mulxq 0x18(%rax), %r10, %r11 +movq %r13, -0x70(%rsp) .cfi_offset r13, -16-0x70 -adc r10, r15 -mov rdx, [ rsi + 0x8 ] -mulx rbx, r8, [ rax + 0x0 ] -adc r11, 0x0 -xor r15, r15 -adcx r8, r9 -adox rbx, r14 -mov [ rsp - 0x58 ], rdi -mulx rdi, r9, [ rax + 0x8 ] -adcx r9, rbx -adox rdi, r10 -mulx rbx, r14, [ rax + 0x10 ] -adcx r14, rdi -adox rbx, r11 -mulx r13, r12, [ rax + 0x18 ] -adcx r12, rbx -mov rdx, 0x100000000 -mulx r11, r10, rcx -adox r13, r15 -adcx 
r13, r15 -xor rdi, rdi -adox r10, r8 -mulx r8, rbx, r10 -adox r11, r9 -adcx rbx, r11 -adox r8, r14 -mov rdx, 0xffffffff00000001 -mulx r9, r15, rcx -adcx r15, r8 -adox r9, r12 -mulx r14, rcx, r10 -mov rdx, [ rsi + 0x10 ] -mulx r10, r12, [ rax + 0x8 ] -adcx rcx, r9 -adox r14, r13 -mulx r11, r13, [ rax + 0x0 ] -mov r9, rdi -adcx r14, r9 -adox rdi, rdi -adc rdi, 0x0 -xor r9, r9 -adcx r13, rbx -adox r11, r15 -mov rdx, [ rsi + 0x10 ] -mulx r15, r8, [ rax + 0x10 ] -adox r10, rcx -mulx rcx, rbx, [ rax + 0x18 ] -mov rdx, [ rsi + 0x18 ] -adcx r12, r11 -mulx rsi, r11, [ rax + 0x8 ] -adcx r8, r10 -adox r15, r14 -adcx rbx, r15 -adox rcx, r9 -adcx rcx, r9 -mulx r15, r10, [ rax + 0x0 ] -add rcx, rdi -mov r14, r9 -adc r14, 0 -xor r9, r9 -adcx r10, r12 -adox r15, r8 -adcx r11, r15 -adox rsi, rbx -mulx r8, r12, [ rax + 0x10 ] -adox r8, rcx -mulx rcx, rbx, [ rax + 0x18 ] -adcx r12, rsi -adox rcx, r9 -mov rdx, 0x100000000 -adcx rbx, r8 -adc rcx, 0 -mulx rdi, r15, r13 -xor rax, rax -adcx rcx, r14 -adc rax, 0 -xor r9, r9 -adox r15, r10 -mulx r14, r10, r15 -adox rdi, r11 -mov rdx, 0xffffffff00000001 -adox r14, r12 -adcx r10, rdi -mulx r12, r11, r13 -adcx r11, r14 -adox r12, rbx -mulx rbx, r13, r15 -adcx r13, r12 -adox rbx, rcx -mov r8, r9 -adox rax, r9 -adcx r8, rbx -adc rax, 0x0 -mov rcx, rax -mov r15, 0xffffffffffffffff -mov rdi, r10 -sub rdi, r15 -mov r14, 0xffffffff -mov r12, r11 -sbb r12, r14 -mov rbx, r13 -sbb rbx, r9 -mov rax, rax -mov rax, r8 -sbb rax, rdx -sbb rcx, r9 -cmovc rdi, r10 -mov r10, [ rsp - 0x58 ] -cmovc rbx, r13 -mov r13, [ rsp - 0x70 ] +adcq %r15, %r10 +movq 0x8(%rsi), %rdx +mulxq (%rax), %r8, %rbx +adcq $0x0, %r11 +xorq %r15, %r15 +adcxq %r9, %r8 +adoxq %r14, %rbx +movq %rdi, -0x58(%rsp) +mulxq 0x8(%rax), %r9, %rdi +adcxq %rbx, %r9 +adoxq %r10, %rdi +mulxq 0x10(%rax), %r14, %rbx +adcxq %rdi, %r14 +adoxq %r11, %rbx +mulxq 0x18(%rax), %r12, %r13 +adcxq %rbx, %r12 +movq $0x100000000, %rdx +mulxq %rcx, %r10, %r11 +adoxq %r15, %r13 +adcxq %r15, %r13 +xorq %rdi, %rdi +adoxq %r8, %r10 +mulxq %r10, %rbx, %r8 +adoxq %r9, %r11 +adcxq %r11, %rbx +adoxq %r14, %r8 +movq $0xffffffff00000001, %rdx +mulxq %rcx, %r15, %r9 +adcxq %r8, %r15 +adoxq %r12, %r9 +mulxq %r10, %rcx, %r14 +movq 0x10(%rsi), %rdx +mulxq 0x8(%rax), %r12, %r10 +adcxq %r9, %rcx +adoxq %r13, %r14 +mulxq (%rax), %r13, %r11 +movq %rdi, %r9 +adcxq %r9, %r14 +adoxq %rdi, %rdi +adcq $0x0, %rdi +xorq %r9, %r9 +adcxq %rbx, %r13 +adoxq %r15, %r11 +movq 0x10(%rsi), %rdx +mulxq 0x10(%rax), %r8, %r15 +adoxq %rcx, %r10 +mulxq 0x18(%rax), %rbx, %rcx +movq 0x18(%rsi), %rdx +adcxq %r11, %r12 +mulxq 0x8(%rax), %r11, %rsi +adcxq %r10, %r8 +adoxq %r14, %r15 +adcxq %r15, %rbx +adoxq %r9, %rcx +adcxq %r9, %rcx +mulxq (%rax), %r10, %r15 +addq %rdi, %rcx +movq %r9, %r14 +adcq $0x0, %r14 +xorq %r9, %r9 +adcxq %r12, %r10 +adoxq %r8, %r15 +adcxq %r15, %r11 +adoxq %rbx, %rsi +mulxq 0x10(%rax), %r12, %r8 +adoxq %rcx, %r8 +mulxq 0x18(%rax), %rbx, %rcx +adcxq %rsi, %r12 +adoxq %r9, %rcx +movq $0x100000000, %rdx +adcxq %r8, %rbx +adcq $0x0, %rcx +mulxq %r13, %r15, %rdi +xorq %rax, %rax +adcxq %r14, %rcx +adcq $0x0, %rax +xorq %r9, %r9 +adoxq %r10, %r15 +mulxq %r15, %r10, %r14 +adoxq %r11, %rdi +movq $0xffffffff00000001, %rdx +adoxq %r12, %r14 +adcxq %rdi, %r10 +mulxq %r13, %r11, %r12 +adcxq %r14, %r11 +adoxq %rbx, %r12 +mulxq %r15, %r13, %rbx +adcxq %r12, %r13 +adoxq %rcx, %rbx +movq %r9, %r8 +adoxq %r9, %rax +adcxq %rbx, %r8 +adcq $0x0, %rax +movq %rax, %rcx +movq $0xffffffffffffffff, %r15 +movq %r10, %rdi +subq %r15, %rdi +movq $0xffffffff, %r14 +movq %r11, %r12 
+sbbq %r14, %r12 +movq %r13, %rbx +sbbq %r9, %rbx +movq %rax, %rax +movq %r8, %rax +sbbq %rdx, %rax +sbbq %r9, %rcx +cmovcq %r10, %rdi +movq -0x58(%rsp), %r10 +cmovcq %r13, %rbx +movq -0x70(%rsp), %r13 .cfi_restore r13 -cmovc r12, r11 -cmovc rax, r8 -mov [ r10 + 0x10 ], rbx -mov rbx, [ rsp - 0x80 ] +cmovcq %r11, %r12 +cmovcq %r8, %rax +movq %rbx, 0x10(%r10) +movq -0x80(%rsp), %rbx .cfi_restore rbx -mov [ r10 + 0x0 ], rdi -mov [ r10 + 0x8 ], r12 -mov [ r10 + 0x18 ], rax -mov r12, [ rsp - 0x78 ] +movq %rdi, (%r10) +movq %r12, 0x8(%r10) +movq %rax, 0x18(%r10) +movq -0x78(%rsp), %r12 .cfi_restore r12 -mov r14, [ rsp - 0x68 ] +movq -0x68(%rsp), %r14 .cfi_restore r14 -mov r15, [ rsp - 0x60 ] +movq -0x60(%rsp), %r15 .cfi_restore r15 -pop rbp +popq %rbp .cfi_restore rbp .cfi_adjust_cfa_offset -8 -ret +retq .cfi_endproc #if defined(__ELF__) .size fiat_p256_adx_mul, .-fiat_p256_adx_mul diff --git a/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_sqr.S b/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_sqr.S index ee6da27a..3ee05295 100644 --- a/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_sqr.S +++ b/Sources/CCryptoBoringSSL/third_party/fiat/asm/fiat_p256_adx_sqr.S @@ -4,7 +4,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ (defined(__APPLE__) || defined(__ELF__)) -.intel_syntax noprefix .text #if defined(__APPLE__) .private_extern _fiat_p256_adx_sqr @@ -19,147 +18,147 @@ fiat_p256_adx_sqr: .cfi_startproc _CET_ENDBR -push rbp +pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset rbp, -16 -mov rbp, rsp -mov rdx, [ rsi + 0x0 ] -mulx r10, rax, [ rsi + 0x18 ] -mulx rcx, r11, rdx -mulx r9, r8, [ rsi + 0x8 ] -mov [ rsp - 0x80 ], rbx +movq %rsp, %rbp +movq (%rsi), %rdx +mulxq 0x18(%rsi), %rax, %r10 +mulxq %rdx, %r11, %rcx +mulxq 0x8(%rsi), %r8, %r9 +movq %rbx, -0x80(%rsp) .cfi_offset rbx, -16-0x80 -xor rbx, rbx -adox r8, r8 -mov [ rsp - 0x78 ], r12 +xorq %rbx, %rbx +adoxq %r8, %r8 +movq %r12, -0x78(%rsp) .cfi_offset r12, -16-0x78 -mulx r12, rbx, [ rsi + 0x10 ] -mov rdx, [ rsi + 0x8 ] -mov [ rsp - 0x70 ], r13 +mulxq 0x10(%rsi), %rbx, %r12 +movq 0x8(%rsi), %rdx +movq %r13, -0x70(%rsp) .cfi_offset r13, -16-0x70 -mov [ rsp - 0x68 ], r14 +movq %r14, -0x68(%rsp) .cfi_offset r14, -16-0x68 -mulx r14, r13, rdx -mov [ rsp - 0x60 ], r15 +mulxq %rdx, %r13, %r14 +movq %r15, -0x60(%rsp) .cfi_offset r15, -16-0x60 -mov [ rsp - 0x58 ], rdi -mulx rdi, r15, [ rsi + 0x10 ] -adcx r12, r15 -mov [ rsp - 0x50 ], r11 -mulx r11, r15, [ rsi + 0x18 ] -adcx r10, rdi -mov rdi, 0x0 -adcx r11, rdi +movq %rdi, -0x58(%rsp) +mulxq 0x10(%rsi), %r15, %rdi +adcxq %r15, %r12 +movq %r11, -0x50(%rsp) +mulxq 0x18(%rsi), %r15, %r11 +adcxq %rdi, %r10 +movq $0x0, %rdi +adcxq %rdi, %r11 clc -adcx rbx, r9 -adox rbx, rbx -adcx rax, r12 -adox rax, rax -adcx r15, r10 -adox r15, r15 -mov rdx, [ rsi + 0x10 ] -mulx r12, r9, [ rsi + 0x18 ] -adcx r9, r11 -adcx r12, rdi -mulx r11, r10, rdx +adcxq %r9, %rbx +adoxq %rbx, %rbx +adcxq %r12, %rax +adoxq %rax, %rax +adcxq %r10, %r15 +adoxq %r15, %r15 +movq 0x10(%rsi), %rdx +mulxq 0x18(%rsi), %r9, %r12 +adcxq %r11, %r9 +adcxq %rdi, %r12 +mulxq %rdx, %r10, %r11 clc -adcx rcx, r8 -adcx r13, rbx -adcx r14, rax -adox r9, r9 -adcx r10, r15 -mov rdx, [ rsi + 0x18 ] -mulx rbx, r8, rdx -adox r12, r12 -adcx r11, r9 -mov rsi, [ rsp - 0x50 ] -adcx r8, r12 -mov rax, 0x100000000 -mov rdx, rax -mulx r15, rax, rsi -adcx rbx, rdi -adox rbx, rdi -xor r9, r9 -adox rax, rcx -adox r15, r13 -mulx rcx, rdi, rax -adcx rdi, r15 -adox rcx, r14 -mov rdx, 0xffffffff00000001 -mulx r14, r13, rsi 
-adox r14, r10 -adcx r13, rcx -mulx r12, r10, rax -adox r12, r11 -mov r11, r9 -adox r11, r8 -adcx r10, r14 -mov r8, r9 -adcx r8, r12 -mov rax, r9 -adcx rax, r11 -mov r15, r9 -adox r15, rbx -mov rdx, 0x100000000 -mulx rcx, rbx, rdi -mov r14, r9 -adcx r14, r15 -mov r12, r9 -adox r12, r12 -adcx r12, r9 -adox rbx, r13 -mulx r11, r13, rbx -mov r15, 0xffffffff00000001 -mov rdx, r15 -mulx rsi, r15, rbx -adox rcx, r10 -adox r11, r8 -mulx r8, r10, rdi -adcx r13, rcx -adox r8, rax -adcx r10, r11 -adox rsi, r14 -mov rdi, r12 -mov rax, r9 -adox rdi, rax -adcx r15, r8 -mov r14, rax -adcx r14, rsi -adcx rdi, r9 -dec r9 -mov rbx, r13 -sub rbx, r9 -mov rcx, 0xffffffff -mov r11, r10 -sbb r11, rcx -mov r8, r15 -sbb r8, rax -mov rsi, r14 -sbb rsi, rdx -sbb rdi, rax -cmovc rbx, r13 -cmovc r8, r15 -cmovc r11, r10 -cmovc rsi, r14 -mov rdi, [ rsp - 0x58 ] -mov [ rdi + 0x18 ], rsi -mov [ rdi + 0x0 ], rbx -mov [ rdi + 0x8 ], r11 -mov [ rdi + 0x10 ], r8 -mov rbx, [ rsp - 0x80 ] +adcxq %r8, %rcx +adcxq %rbx, %r13 +adcxq %rax, %r14 +adoxq %r9, %r9 +adcxq %r15, %r10 +movq 0x18(%rsi), %rdx +mulxq %rdx, %r8, %rbx +adoxq %r12, %r12 +adcxq %r9, %r11 +movq -0x50(%rsp), %rsi +adcxq %r12, %r8 +movq $0x100000000, %rax +movq %rax, %rdx +mulxq %rsi, %rax, %r15 +adcxq %rdi, %rbx +adoxq %rdi, %rbx +xorq %r9, %r9 +adoxq %rcx, %rax +adoxq %r13, %r15 +mulxq %rax, %rdi, %rcx +adcxq %r15, %rdi +adoxq %r14, %rcx +movq $0xffffffff00000001, %rdx +mulxq %rsi, %r13, %r14 +adoxq %r10, %r14 +adcxq %rcx, %r13 +mulxq %rax, %r10, %r12 +adoxq %r11, %r12 +movq %r9, %r11 +adoxq %r8, %r11 +adcxq %r14, %r10 +movq %r9, %r8 +adcxq %r12, %r8 +movq %r9, %rax +adcxq %r11, %rax +movq %r9, %r15 +adoxq %rbx, %r15 +movq $0x100000000, %rdx +mulxq %rdi, %rbx, %rcx +movq %r9, %r14 +adcxq %r15, %r14 +movq %r9, %r12 +adoxq %r12, %r12 +adcxq %r9, %r12 +adoxq %r13, %rbx +mulxq %rbx, %r13, %r11 +movq $0xffffffff00000001, %r15 +movq %r15, %rdx +mulxq %rbx, %r15, %rsi +adoxq %r10, %rcx +adoxq %r8, %r11 +mulxq %rdi, %r10, %r8 +adcxq %rcx, %r13 +adoxq %rax, %r8 +adcxq %r11, %r10 +adoxq %r14, %rsi +movq %r12, %rdi +movq %r9, %rax +adoxq %rax, %rdi +adcxq %r8, %r15 +movq %rax, %r14 +adcxq %rsi, %r14 +adcxq %r9, %rdi +decq %r9 +movq %r13, %rbx +subq %r9, %rbx +movq $0xffffffff, %rcx +movq %r10, %r11 +sbbq %rcx, %r11 +movq %r15, %r8 +sbbq %rax, %r8 +movq %r14, %rsi +sbbq %rdx, %rsi +sbbq %rax, %rdi +cmovcq %r13, %rbx +cmovcq %r15, %r8 +cmovcq %r10, %r11 +cmovcq %r14, %rsi +movq -0x58(%rsp), %rdi +movq %rsi, 0x18(%rdi) +movq %rbx, (%rdi) +movq %r11, 0x8(%rdi) +movq %r8, 0x10(%rdi) +movq -0x80(%rsp), %rbx .cfi_restore rbx -mov r12, [ rsp - 0x78 ] +movq -0x78(%rsp), %r12 .cfi_restore r12 -mov r13, [ rsp - 0x70 ] +movq -0x70(%rsp), %r13 .cfi_restore r13 -mov r14, [ rsp - 0x68 ] +movq -0x68(%rsp), %r14 .cfi_restore r14 -mov r15, [ rsp - 0x60 ] +movq -0x60(%rsp), %r15 .cfi_restore r15 -pop rbp +popq %rbp .cfi_restore rbp .cfi_adjust_cfa_offset -8 -ret +retq .cfi_endproc #if defined(__ELF__) .size fiat_p256_adx_sqr, .-fiat_p256_adx_sqr
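The new experimental Dilithium3 header added in this diff declares a complete keypair/sign/verify surface, with every function marked OPENSSL_DEPRECATED and the whole API gated behind OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM. A minimal round-trip sketch against exactly those declarations follows; it is illustrative only (the header itself says this code path is superseded by ML-DSA and will be removed), and the include path and build flag are assumptions about how the vendored headers are consumed.

/* Sketch only: exercises the deprecated Dilithium3 declarations shown in the
 * header above. Assumes -DOPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM and that the
 * vendored experimental header directory is on the include path. */
#define OPENSSL_UNSTABLE_EXPERIMENTAL_DILITHIUM
#include "experimental/CCryptoBoringSSL_dilithium.h"

#include <stdio.h>

int main(void) {
  uint8_t encoded_pub[DILITHIUM_PUBLIC_KEY_BYTES];
  struct DILITHIUM_private_key priv;
  if (!DILITHIUM_generate_key(encoded_pub, &priv)) {
    return 1;
  }

  const uint8_t msg[] = {'h', 'i'};
  uint8_t sig[DILITHIUM_SIGNATURE_BYTES];
  if (!DILITHIUM_sign(sig, &priv, msg, sizeof(msg))) {
    return 1;
  }

  /* DILITHIUM_verify takes the structured public key, so derive it from the
   * private key rather than re-parsing the encoded bytes. */
  struct DILITHIUM_public_key pub;
  if (!DILITHIUM_public_from_private(&pub, &priv)) {
    return 1;
  }
  printf("verify: %d\n", DILITHIUM_verify(&pub, sig, msg, sizeof(msg)));
  return 0;
}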
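The SPHINCS+ changes in CCryptoBoringSSL_spx.h amount to a rename of the public entry points from spx_* to SPX_* plus the same OPENSSL_UNSTABLE_EXPERIMENTAL_SPX guard that the Kyber header gained. A corresponding keygen/sign/verify sketch against the renamed declarations, under the same include-path and flag assumptions as above:

/* Sketch only: exercises the renamed SPX_* declarations shown in the header
 * above. Assumes -DOPENSSL_UNSTABLE_EXPERIMENTAL_SPX. */
#define OPENSSL_UNSTABLE_EXPERIMENTAL_SPX
#include "experimental/CCryptoBoringSSL_spx.h"

#include <stdio.h>

int main(void) {
  uint8_t pk[SPX_PUBLIC_KEY_BYTES];
  uint8_t sk[SPX_SECRET_KEY_BYTES];
  SPX_generate_key(pk, sk);

  const uint8_t msg[] = {1, 2, 3};
  uint8_t sig[SPX_SIGNATURE_BYTES];
  SPX_sign(sig, sk, msg, sizeof(msg), /*randomized=*/1);

  /* SPX_verify returns 1 when the signature matches, 0 otherwise. */
  printf("verify: %d\n", SPX_verify(sig, pk, msg, sizeof(msg)));
  return 0;
}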