diff --git a/.github/workflows/sha1.yml b/.github/workflows/sha1.yml
index ca09fd895..2eb5c4312 100644
--- a/.github/workflows/sha1.yml
+++ b/.github/workflows/sha1.yml
@@ -187,3 +187,26 @@ jobs:
           override: true
       - run: cargo test --no-default-features
       - run: cargo test
+
+  # TODO: merge with the main test job once the MSRV is bumped to 1.59 or higher
+  # TODO: do we need to cover no-std platforms here?
+  test-inline-asm:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        target:
+          - aarch64-unknown-linux-gnu
+          - x86_64-unknown-linux-gnu
+          - i686-unknown-linux-gnu
+          # TODO - aarch64-apple-darwin
+        rust:
+          - 1.59.0 # MSRV
+    steps:
+      - uses: actions/checkout@v3
+      - uses: RustCrypto/actions/cargo-cache@master
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: ${{ matrix.rust }}
+          override: true
+      - run: cargo test --features inline-asm
diff --git a/Cargo.lock b/Cargo.lock
index d851c17b4..66c3279ff 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "asm_block"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "466c0990cf15ef0f331f19fdc16fd60229606ab237476c70a66b747fce2911ab"
+
 [[package]]
 name = "blake2"
 version = "0.10.6"
@@ -204,6 +210,7 @@
 name = "sha1"
 version = "0.10.5"
 dependencies = [
+ "asm_block",
 "cfg-if",
 "cpufeatures",
 "digest",
diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml
index 8ff801508..06a7bc44e 100644
--- a/sha1/Cargo.toml
+++ b/sha1/Cargo.toml
@@ -18,6 +18,7 @@ cfg-if = "1.0"
 [target.'cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))'.dependencies]
 cpufeatures = "0.2"
 sha1-asm = { version = "0.5", optional = true }
+asm_block = { version = "0.1.3", optional = true }
 
 [dev-dependencies]
 digest = { version = "0.10.4", features = ["dev"] }
@@ -30,6 +31,7 @@ oid = ["digest/oid"] # Enable OID support. WARNING: Bumps MSRV to 1.57
 asm = ["sha1-asm"] # WARNING: this feature SHOULD NOT be enabled by library crates
 compress = [] # Expose compress function
 force-soft = [] # Force software implementation
+inline-asm = ["asm_block"] # Use inline-assembly implementations. WARNING: bumps MSRV to 1.59. TODO: check whether the "SHOULD NOT be enabled by library crates" warning on `asm` applies here as well.
 
 [package.metadata.docs.rs]
 all-features = true
diff --git a/sha1/README.md b/sha1/README.md
index bd76f0973..d388cc6f1 100644
--- a/sha1/README.md
+++ b/sha1/README.md
@@ -23,6 +23,8 @@ We provide this crate for legacy interoperability purposes only.
 
 Rust **1.41** or higher.
 
+Enabling the `inline-asm` feature requires Rust **1.59** or higher.
+
 Minimum supported Rust version can be changed in the future, but it will be done
 with a minor version bump.
diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs
new file mode 100644
index 000000000..c5fb97351
--- /dev/null
+++ b/sha1/src/asm/aarch64.rs
@@ -0,0 +1,279 @@
+//! SHA-1 hash in AArch64 assembly, adapted from Emmanuel Gil Peyrot's MIT-licensed implementation.
+//
+// /*
+// * SHA-1 hash in AArch64 assembly
+// *
+// * Copyright (c) 2020 Emmanuel Gil Peyrot. (MIT License)
+// *
+// * Permission is hereby granted, free of charge, to any person obtaining a copy of
+// * this software and associated documentation files (the "Software"), to deal in
+// * the Software without restriction, including without limitation the rights to
+// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+// * the Software, and to permit persons to whom the Software is furnished to do so,
+// * subject to the following conditions:
+// * - The above copyright notice and this permission notice shall be included in
+// *   all copies or substantial portions of the Software.
+// * - The Software is provided "as is", without warranty of any kind, express or
+// *   implied, including but not limited to the warranties of merchantability,
+// *   fitness for a particular purpose and noninfringement. In no event shall the
+// *   authors or copyright holders be liable for any claim, damages or other
+// *   liability, whether in an action of contract, tort or otherwise, arising from,
+// *   out of or in connection with the Software or the use or other dealings in the
+// *   Software.
+// */
+use core::arch::asm;
+
+/// SHA-1 compress function for AArch64, built on the SHA-1 crypto extension
+/// instructions. The register assignment mirrors the original standalone
+/// assembly (see the storage-usage table below), so registers are named
+/// explicitly instead of being left to the compiler's allocator.
+#[cfg(all(feature = "inline-asm", target_arch = "aarch64"))]
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // SAFETY: executes AArch64 SHA-1 crypto instructions; the caller must only
+    // dispatch here on CPUs that support them.
+    unsafe {
+        asm!(
+            // Storage usage docs from the original code:
+            // /*
+            // * Storage usage:
+            // *   Bytes  Location  Description
+            // *   4      x0        state argument
+            // *   4      x1        block argument
+            // *   16     q0        W0
+            // *   16     q1        W1
+            // *   16     q2        W2
+            // *   16     q3        W3
+            // *   16     q4        k
+            // *   16     q5        Original ABCD
+            // *   16     q6        ABCD (with s3 being A)
+            // *   4      s16       E
+            // *   4      s17       e0
+            // *   4      s18       e1
+            // *   16     q19       wk
+            // */
+
+            // Load state in registers
+            // original code:
+            //   ldr q5, [x0]
+            //   ldr s16, [x0, 16]
+            // this now happens at the bottom...
+            // TODO what is this doing?
+ // i believe it's copying state[0..4] into v6 (which is also q6) + // confirmed this is the mutable copy of the first 4 words of the state + "mov v6.16b, v5.16b", + + // Load block in registers + // original code: + // ldr q0, [x1] + // ldr q1, [x1, 16] + // ldr q2, [x1, 32] + // ldr q3, [x1, 48] + // this is at the bottom now + + // from original code: TODO: only do that on little endian + // this flips the blocks from little to big endian + "rev32 v0.16b, v0.16b", + "rev32 v1.16b, v1.16b", + "rev32 v2.16b, v2.16b", + "rev32 v3.16b, v3.16b", + + // k for the next five rounds + "adrp x1, .K0", + "ldr q4, [x1, #:lo12:.K0]", + + // 0 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1c q6, s16, v19.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 1 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1c q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 2 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1c q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 3 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1c q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 4 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1c q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // k for the next five rounds + "adrp x1, .K1", + "ldr q4, [x1, #:lo12:.K1]", + + // 5 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 6 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 7 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 8 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 9 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // k for the next five rounds + "adrp x1, .K2", + "ldr q4, [x1, #:lo12:.K2]", + + // 10 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 11 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1m q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 12 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 13 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1m q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 14 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // k for the next five rounds + "adrp x1, .K3", + "ldr q4, [x1, #:lo12:.K3]", + + // 15 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 16 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + + // 17 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + + // 18 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1p q6, s17, v19.4s", + + // 19 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + + // Update state + 
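+            // Feed-forward: v6 holds the working ABCD and v5 the original ABCD, while s16
+            // still holds the original E and s17 the rolling E produced by round 19; the two
+            // adds below fold the saved values back into the state, as FIPS 180-4 requires.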
"add v6.4s, v6.4s, v5.4s", + // source code: str q6, [x0] + // this now happens at the bottom + "add v16.2s, v16.2s, v17.2s", + // source code: str s16, [x0, 16] + // this now happens at the bottom + + "ret", // TODO is this right + + ".align 4", // TODO ummm alignment... + ".K0:", // TODO are labels just the same in inline asm in rust? + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".K1:", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".K2:", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".K3:", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + + // state ins and outs + in("q4") state.as_mut_ptr(), + inout("s16") state[4], + lateout("q6") state as *mut u32, + // blocks in + in("q0") blocks[0][0..16].as_ptr(), + in("q1") blocks[0][16..32].as_ptr(), + in("q2") blocks[0][32..48].as_ptr(), + in("q3") blocks[0][48..64].as_ptr(), + // some clobbers + out("q5") _, + out("s17") _, + out("s18") _, + out("q19") _, + // TODO make sure there aren't any other clobbers + ); + }; +} diff --git a/sha1/src/asm/aarch64_apple.rs b/sha1/src/asm/aarch64_apple.rs new file mode 100644 index 000000000..832c2c359 --- /dev/null +++ b/sha1/src/asm/aarch64_apple.rs @@ -0,0 +1,237 @@ +// /* +// * SHA-1 hash in AArch64 assembly +// * +// * Copyright (c) 2020 Emmanuel Gil Peyrot . (MIT License) +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. 
+// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// .global _sha1_compress +// _sha1_compress: +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 x0 state argument +// * 4 x1 block argument +// * 16 q0 W0 +// * 16 q1 W1 +// * 16 q2 W2 +// * 16 q3 W3 +// * 16 q4 k +// * 16 q5 Original ABCD +// * 16 q6 ABCD (with s3 being A) +// * 4 s16 E +// * 4 s17 e0 +// * 4 s18 e1 +// * 16 q19 wk +// */ +// +// // Load state in registers +// ldr q5, [x0] +// ldr s16, [x0, 16] +// mov v6.16b, v5.16b +// +// // Load block in registers +// ldr q0, [x1] +// ldr q1, [x1, 16] +// ldr q2, [x1, 32] +// ldr q3, [x1, 48] +// +// // TODO: only do that on little endian +// rev32 v0.16b, v0.16b +// rev32 v1.16b, v1.16b +// rev32 v2.16b, v2.16b +// rev32 v3.16b, v3.16b +// +// // k for the next five rounds +// adrp x1, .K0@PAGE +// ldr q4, [x1, #:lo12:.K0@PAGEOFF] +// +// // 0 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s16, v19.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 1 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 2 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 3 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 4 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // k for the next five rounds +// adrp x1, .K1@PAGE +// ldr q4, [x1, #:lo12:.K1@PAGEOFF] +// +// // 5 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 6 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 7 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 8 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 9 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // k for the next five rounds +// adrp x1, .K2@PAGE +// ldr q4, [x1, #:lo12:.K2@PAGEOFF] +// +// // 10 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 11 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 12 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 13 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 14 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // k for the next five rounds +// adrp x1, .K3@PAGE +// ldr q4, [x1, #:lo12:.K3@PAGEOFF] +// +// // 15 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 16 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s 
+// sha1su1 v3.4s, v2.4s +// +// // 17 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // 18 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// +// // 19 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // Update state +// add v6.4s, v6.4s, v5.4s +// str q6, [x0] +// add v16.2s, v16.2s, v17.2s +// str s16, [x0, 16] +// +// ret +// .align 4 +// .K0: +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .K1: +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .K2: +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .K3: +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 diff --git a/sha1/src/asm/mod.rs b/sha1/src/asm/mod.rs new file mode 100644 index 000000000..164c8065d --- /dev/null +++ b/sha1/src/asm/mod.rs @@ -0,0 +1,25 @@ +// TODO (laudiacay): here, do the switch to figure out which architecture's method we'll do... +// here's how that md5 PR did it (obviously wrong for what we want here...) +// #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +// mod x86; +// +// #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +// pub use x86::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "x86",))] +mod x86; +#[cfg(all(feature = "inline-asm", target_arch = "x86",))] +pub use x86::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "x86_64",))] +mod x86_64; +#[cfg(all(feature = "inline-asm", target_arch = "x86_64",))] +pub use x86_64::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "aarch64",))] +mod aarch64; +#[cfg(all(feature = "inline-asm", target_arch = "aarch64",))] +pub use aarch64::compress; + +// TODO(laudiacay) i don't know how to detect M1 +mod aarch64_apple; diff --git a/sha1/src/asm/x86.rs b/sha1/src/asm/x86.rs new file mode 100644 index 000000000..f7b24e839 --- /dev/null +++ b/sha1/src/asm/x86.rs @@ -0,0 +1,343 @@ +//! SHA-1 hash in x86 assembly. adapted from Project Nayuki's MIT licensed code... +// /* +// * SHA-1 hash in x86 assembly +// * +// * Copyright (c) 2014 Project Nayuki. (MIT License) +// * https://www.nayuki.io/page/fast-sha1-hash-implementation-in-x86-assembly +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. 
+// */ +use core::arch::asm; + +use asm_block::asm_block; + +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// #ifdef __APPLE__ +// .globl _sha1_compress +// _sha1_compress: +// #else +// .globl sha1_compress +// sha1_compress: +// #endif +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 eax SHA-1 state variable A +// * 4 ebx SHA-1 state variable B +// * 4 ecx SHA-1 state variable C +// * 4 edx SHA-1 state variable D +// * 4 ebp SHA-1 state variable E +// * 4 esi Temporary for calculation per round +// * 4 edi (First 16 rounds) base address of block array argument (read-only); (last 64 rounds) temporary for calculation per round +// * 4 esp x86 stack pointer +// * 64 [esp+ 0] Circular buffer of most recent 16 key schedule items, 4 bytes each +// * 4 [esp+64] Caller's value of ebx +// * 4 [esp+68] Caller's value of esi +// * 4 [esp+72] Caller's value of edi +// * 4 [esp+76] Caller's value of ebp +// */ + + +// #define round0a(a, b, c, d, e, i) \ +// movl (i*4)(%edi), %esi; \ +// bswapl %esi; \ +// movl %esi, (i*4)(%esp); \ +// addl %esi, %e; \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) + +macro_rules! round0a { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! { + movl ($i*4)(%edi), %esi; + bswapl %esi; + movl %esi, ($i*4)(%esp); + addl %esi, $e; + movl $c, %esi; + xorl $d, %esi; + andl $b, %esi; + xorl $d, %esi; + ROUNDTAIL!($a, $b, $e, $i, 0x5A827999); + } + } + }; +} + +// #define SCHEDULE(i, e) \ +// movl (((i- 3)&0xF)*4)(%esp), %esi; \ +// xorl (((i- 8)&0xF)*4)(%esp), %esi; \ +// xorl (((i-14)&0xF)*4)(%esp), %esi; \ +// xorl (((i-16)&0xF)*4)(%esp), %esi; \ +// roll $1, %esi; \ +// addl %esi, %e; \ +// movl %esi, ((i&0xF)*4)(%esp); + +macro_rules! schedule { + ($i:tt, $e:tt) => { + concat!{ + asm_block! { + movl ((($i- 3)&0xF)*4)(%esp), %esi; + xorl ((($i- 8)&0xF)*4)(%esp), %esi; + xorl ((($i-14)&0xF)*4)(%esp), %esi; + xorl ((($i-16)&0xF)*4)(%esp), %esi; + roll $1, %esi; + addl %esi, $e; + movl %esi, (($i&0xF)*4)(%esp); + } + } + }; +} + +// #define ROUND0b(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) + +macro_rules! round0b { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! { + schedule!($i, $e); + movl $c, %esi; + xorl $d, %esi; + andl $b, %esi; + xorl $d, %esi; + roundtail!($a, $b, $e, $i, 0x5A827999); + } + } + }; +} + +// #define ROUND1(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1) + +macro_rules! round1 { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! { + schedule!($i, $e); + movl $b, %esi; + xorl $c, %esi; + xorl $d, %esi; + roundtail!($a, $b, $e, $i, 0x6ED9EBA1); + } + } + }; +} + +// #define ROUND2(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// movl %c, %edi; \ +// orl %d, %esi; \ +// andl %b, %esi; \ +// andl %d, %edi; \ +// orl %edi, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x8F1BBCDC) + +macro_rules! round2 { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! 
{ + schedule!($i, $e); + movl $c, %esi; + movl $c, %edi; + orl $d, %esi; + andl $b, %esi; + andl $d, %edi; + orl %edi, %esi; + roundtail!($a, $b, $e, $i, 0x8F1BBCDC); + } + } + }; +} + +// #define ROUND3(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0xCA62C1D6) + +macro_rules! round3 { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! { + schedule!($i, $e); + movl $b, %esi; + xorl $c, %esi; + xorl $d, %esi; + roundtail!($a, $b, $e, $i, 0xCA62C1D6); + } + } + }; +} + +// #define ROUNDTAIL(a, b, e, i, k) \ +// roll $30, %b; \ +// leal k(%e,%esi), %e; \ +// movl %a, %esi; \ +// roll $5, %esi; \ +// addl %esi, %e; + +macro_rules! roundtail { + ($a:tt, $b:tt, $e:tt, $i:tt, $k:tt) => { + concat! { + asm_block! { + roll $30, $b; + leal $k($e,%esi), $e; + movl $a, %esi; + roll $5, %esi; + addl %esi, $e; + } + } + }; +} + +macro_rules! asm_sha1 { + // states + // /* Save registers */ +// subl $80, %esp +// movl %ebx, 64(%esp) +// movl %esi, 68(%esp) +// movl %edi, 72(%esp) +// movl %ebp, 76(%esp) +// +// /* Load arguments */ +// movl 84(%esp), %esi /* state */ +// movl 88(%esp), %edi /* block */ +// movl 0(%esi), %eax /* a */ +// movl 4(%esi), %ebx /* b */ +// movl 8(%esi), %ecx /* c */ +// movl 12(%esi), %edx /* d */ +// movl 16(%esi), %ebp /* e */ +// +// /* 80 rounds of hashing */ +// round0a(eax, ebx, ecx, edx, ebp, 0) +// round0a(ebp, eax, ebx, ecx, edx, 1) +// round0a(edx, ebp, eax, ebx, ecx, 2) +// round0a(ecx, edx, ebp, eax, ebx, 3) +// round0a(ebx, ecx, edx, ebp, eax, 4) +// round0a(eax, ebx, ecx, edx, ebp, 5) +// round0a(ebp, eax, ebx, ecx, edx, 6) +// round0a(edx, ebp, eax, ebx, ecx, 7) +// round0a(ecx, edx, ebp, eax, ebx, 8) +// round0a(ebx, ecx, edx, ebp, eax, 9) +// round0a(eax, ebx, ecx, edx, ebp, 10) +// round0a(ebp, eax, ebx, ecx, edx, 11) +// round0a(edx, ebp, eax, ebx, ecx, 12) +// round0a(ecx, edx, ebp, eax, ebx, 13) +// round0a(ebx, ecx, edx, ebp, eax, 14) +// round0a(eax, ebx, ecx, edx, ebp, 15) +// ROUND0b(ebp, eax, ebx, ecx, edx, 16) +// ROUND0b(edx, ebp, eax, ebx, ecx, 17) +// ROUND0b(ecx, edx, ebp, eax, ebx, 18) +// ROUND0b(ebx, ecx, edx, ebp, eax, 19) +// ROUND1(eax, ebx, ecx, edx, ebp, 20) +// ROUND1(ebp, eax, ebx, ecx, edx, 21) +// ROUND1(edx, ebp, eax, ebx, ecx, 22) +// ROUND1(ecx, edx, ebp, eax, ebx, 23) +// ROUND1(ebx, ecx, edx, ebp, eax, 24) +// ROUND1(eax, ebx, ecx, edx, ebp, 25) +// ROUND1(ebp, eax, ebx, ecx, edx, 26) +// ROUND1(edx, ebp, eax, ebx, ecx, 27) +// ROUND1(ecx, edx, ebp, eax, ebx, 28) +// ROUND1(ebx, ecx, edx, ebp, eax, 29) +// ROUND1(eax, ebx, ecx, edx, ebp, 30) +// ROUND1(ebp, eax, ebx, ecx, edx, 31) +// ROUND1(edx, ebp, eax, ebx, ecx, 32) +// ROUND1(ecx, edx, ebp, eax, ebx, 33) +// ROUND1(ebx, ecx, edx, ebp, eax, 34) +// ROUND1(eax, ebx, ecx, edx, ebp, 35) +// ROUND1(ebp, eax, ebx, ecx, edx, 36) +// ROUND1(edx, ebp, eax, ebx, ecx, 37) +// ROUND1(ecx, edx, ebp, eax, ebx, 38) +// ROUND1(ebx, ecx, edx, ebp, eax, 39) +// ROUND2(eax, ebx, ecx, edx, ebp, 40) +// ROUND2(ebp, eax, ebx, ecx, edx, 41) +// ROUND2(edx, ebp, eax, ebx, ecx, 42) +// ROUND2(ecx, edx, ebp, eax, ebx, 43) +// ROUND2(ebx, ecx, edx, ebp, eax, 44) +// ROUND2(eax, ebx, ecx, edx, ebp, 45) +// ROUND2(ebp, eax, ebx, ecx, edx, 46) +// ROUND2(edx, ebp, eax, ebx, ecx, 47) +// ROUND2(ecx, edx, ebp, eax, ebx, 48) +// ROUND2(ebx, ecx, edx, ebp, eax, 49) +// ROUND2(eax, ebx, ecx, edx, ebp, 50) +// ROUND2(ebp, eax, ebx, ecx, edx, 51) +// ROUND2(edx, ebp, eax, ebx, ecx, 52) +// ROUND2(ecx, 
edx, ebp, eax, ebx, 53)
+// ROUND2(ebx, ecx, edx, ebp, eax, 54)
+// ROUND2(eax, ebx, ecx, edx, ebp, 55)
+// ROUND2(ebp, eax, ebx, ecx, edx, 56)
+// ROUND2(edx, ebp, eax, ebx, ecx, 57)
+// ROUND2(ecx, edx, ebp, eax, ebx, 58)
+// ROUND2(ebx, ecx, edx, ebp, eax, 59)
+// ROUND3(eax, ebx, ecx, edx, ebp, 60)
+// ROUND3(ebp, eax, ebx, ecx, edx, 61)
+// ROUND3(edx, ebp, eax, ebx, ecx, 62)
+// ROUND3(ecx, edx, ebp, eax, ebx, 63)
+// ROUND3(ebx, ecx, edx, ebp, eax, 64)
+// ROUND3(eax, ebx, ecx, edx, ebp, 65)
+// ROUND3(ebp, eax, ebx, ecx, edx, 66)
+// ROUND3(edx, ebp, eax, ebx, ecx, 67)
+// ROUND3(ecx, edx, ebp, eax, ebx, 68)
+// ROUND3(ebx, ecx, edx, ebp, eax, 69)
+// ROUND3(eax, ebx, ecx, edx, ebp, 70)
+// ROUND3(ebp, eax, ebx, ecx, edx, 71)
+// ROUND3(edx, ebp, eax, ebx, ecx, 72)
+// ROUND3(ecx, edx, ebp, eax, ebx, 73)
+// ROUND3(ebx, ecx, edx, ebp, eax, 74)
+// ROUND3(eax, ebx, ecx, edx, ebp, 75)
+// ROUND3(ebp, eax, ebx, ecx, edx, 76)
+// ROUND3(edx, ebp, eax, ebx, ecx, 77)
+// ROUND3(ecx, edx, ebp, eax, ebx, 78)
+// ROUND3(ebx, ecx, edx, ebp, eax, 79)
+//
+// /* Save updated state */
+// movl 84(%esp), %esi
+// addl %eax, 0(%esi)
+// addl %ebx, 4(%esi)
+// addl %ecx, 8(%esi)
+// addl %edx, 12(%esi)
+// addl %ebp, 16(%esi)
+//
+// /* Restore registers */
+// movl 64(%esp), %ebx
+// movl 68(%esp), %esi
+// movl 72(%esp), %edi
+// movl 76(%esp), %ebp
+// addl $80, %esp
+// retl
+}
+
+#[cfg(all(feature = "inline-asm", target_arch = "x86"))]
+pub fn compress(_state: &mut [u32; 5], _blocks: &[[u8; 64]]) {
+    unimplemented!("compress() is not implemented for x86");
+}
diff --git a/sha1/src/asm/x86_64.rs b/sha1/src/asm/x86_64.rs
new file mode 100644
index 000000000..37c26dd15
--- /dev/null
+++ b/sha1/src/asm/x86_64.rs
@@ -0,0 +1,217 @@
+// /*
+// * SHA-1 hash in x86-64 assembly
+// *
+// * Copyright (c) 2015 Project Nayuki. (MIT License)
+// * https://www.nayuki.io/page/fast-sha1-hash-implementation-in-x86-assembly
+// *
+// * Permission is hereby granted, free of charge, to any person obtaining a copy of
+// * this software and associated documentation files (the "Software"), to deal in
+// * the Software without restriction, including without limitation the rights to
+// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+// * the Software, and to permit persons to whom the Software is furnished to do so,
+// * subject to the following conditions:
+// * - The above copyright notice and this permission notice shall be included in
+// *   all copies or substantial portions of the Software.
+// * - The Software is provided "as is", without warranty of any kind, express or
+// *   implied, including but not limited to the warranties of merchantability,
+// *   fitness for a particular purpose and noninfringement. In no event shall the
+// *   authors or copyright holders be liable for any claim, damages or other
+// *   liability, whether in an action of contract, tort or otherwise, arising from,
+// *   out of or in connection with the Software or the use or other dealings in the
+// *   Software.
+// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// #ifdef __APPLE__ +// .globl _sha1_compress +// _sha1_compress: +// #else +// .globl sha1_compress +// sha1_compress: +// #endif +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 eax SHA-1 state variable A +// * 4 ebx SHA-1 state variable B +// * 4 ecx SHA-1 state variable C +// * 4 edx SHA-1 state variable D +// * 4 ebp SHA-1 state variable E +// * 4 esi Temporary for calculation per round +// * 4 edi (Last 64 rounds) temporary for calculation per round +// * 8 rdi (First 16 rounds) base address of block array argument (read-only) +// * 8 r8 Base address of state array argument (read-only) +// * 8 rsp x86-64 stack pointer +// * 64 [rsp+0] Circular buffer of most recent 16 key schedule items, 4 bytes each +// * 16 xmm0 Caller's value of rbx (only low 64 bits are used) +// * 16 xmm1 Caller's value of rbp (only low 64 bits are used) +// */ +// +// #define round0a(a, b, c, d, e, i) \ +// movl (i*4)(%rdi), %esi; \ +// bswapl %esi; \ +// movl %esi, (i*4)(%rsp); \ +// addl %esi, %e; \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define SCHEDULE(i, e) \ +// movl (((i- 3)&0xF)*4)(%rsp), %esi; \ +// xorl (((i- 8)&0xF)*4)(%rsp), %esi; \ +// xorl (((i-14)&0xF)*4)(%rsp), %esi; \ +// xorl (((i-16)&0xF)*4)(%rsp), %esi; \ +// roll $1, %esi; \ +// addl %esi, %e; \ +// movl %esi, ((i&0xF)*4)(%rsp); +// +// #define ROUND0b(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define ROUND1(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1) +// +// #define ROUND2(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// movl %c, %edi; \ +// orl %d, %esi; \ +// andl %b, %esi; \ +// andl %d, %edi; \ +// orl %edi, %esi; \ +// ROUNDTAIL(a, b, e, i, -0x70E44324) +// +// #define ROUND3(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, -0x359D3E2A) +// +// #define ROUNDTAIL(a, b, e, i, k) \ +// roll $30, %b; \ +// leal k(%e,%esi), %e; \ +// movl %a, %esi; \ +// roll $5, %esi; \ +// addl %esi, %e; +// +// /* Save registers, allocate scratch space */ +// movq %rbx, %xmm0 +// movq %rbp, %xmm1 +// subq $64, %rsp +// +// /* Load arguments */ +// movq %rdi, %r8 +// movl 0(%rdi), %eax /* a */ +// movl 4(%rdi), %ebx /* b */ +// movl 8(%rdi), %ecx /* c */ +// movl 12(%rdi), %edx /* d */ +// movl 16(%rdi), %ebp /* e */ +// movq %rsi, %rdi +// +// /* 80 rounds of hashing */ +// round0a(eax, ebx, ecx, edx, ebp, 0) +// round0a(ebp, eax, ebx, ecx, edx, 1) +// round0a(edx, ebp, eax, ebx, ecx, 2) +// round0a(ecx, edx, ebp, eax, ebx, 3) +// round0a(ebx, ecx, edx, ebp, eax, 4) +// round0a(eax, ebx, ecx, edx, ebp, 5) +// round0a(ebp, eax, ebx, ecx, edx, 6) +// round0a(edx, ebp, eax, ebx, ecx, 7) +// round0a(ecx, edx, ebp, eax, ebx, 8) +// round0a(ebx, ecx, edx, ebp, eax, 9) +// round0a(eax, ebx, ecx, edx, ebp, 10) +// round0a(ebp, eax, ebx, ecx, edx, 11) +// round0a(edx, ebp, eax, ebx, ecx, 12) +// round0a(ecx, edx, ebp, eax, ebx, 13) +// round0a(ebx, ecx, edx, ebp, eax, 14) +// round0a(eax, ebx, ecx, edx, ebp, 15) +// ROUND0b(ebp, eax, ebx, ecx, edx, 16) +// ROUND0b(edx, ebp, eax, ebx, ecx, 17) +// ROUND0b(ecx, edx, ebp, eax, ebx, 18) +// 
ROUND0b(ebx, ecx, edx, ebp, eax, 19) +// ROUND1(eax, ebx, ecx, edx, ebp, 20) +// ROUND1(ebp, eax, ebx, ecx, edx, 21) +// ROUND1(edx, ebp, eax, ebx, ecx, 22) +// ROUND1(ecx, edx, ebp, eax, ebx, 23) +// ROUND1(ebx, ecx, edx, ebp, eax, 24) +// ROUND1(eax, ebx, ecx, edx, ebp, 25) +// ROUND1(ebp, eax, ebx, ecx, edx, 26) +// ROUND1(edx, ebp, eax, ebx, ecx, 27) +// ROUND1(ecx, edx, ebp, eax, ebx, 28) +// ROUND1(ebx, ecx, edx, ebp, eax, 29) +// ROUND1(eax, ebx, ecx, edx, ebp, 30) +// ROUND1(ebp, eax, ebx, ecx, edx, 31) +// ROUND1(edx, ebp, eax, ebx, ecx, 32) +// ROUND1(ecx, edx, ebp, eax, ebx, 33) +// ROUND1(ebx, ecx, edx, ebp, eax, 34) +// ROUND1(eax, ebx, ecx, edx, ebp, 35) +// ROUND1(ebp, eax, ebx, ecx, edx, 36) +// ROUND1(edx, ebp, eax, ebx, ecx, 37) +// ROUND1(ecx, edx, ebp, eax, ebx, 38) +// ROUND1(ebx, ecx, edx, ebp, eax, 39) +// ROUND2(eax, ebx, ecx, edx, ebp, 40) +// ROUND2(ebp, eax, ebx, ecx, edx, 41) +// ROUND2(edx, ebp, eax, ebx, ecx, 42) +// ROUND2(ecx, edx, ebp, eax, ebx, 43) +// ROUND2(ebx, ecx, edx, ebp, eax, 44) +// ROUND2(eax, ebx, ecx, edx, ebp, 45) +// ROUND2(ebp, eax, ebx, ecx, edx, 46) +// ROUND2(edx, ebp, eax, ebx, ecx, 47) +// ROUND2(ecx, edx, ebp, eax, ebx, 48) +// ROUND2(ebx, ecx, edx, ebp, eax, 49) +// ROUND2(eax, ebx, ecx, edx, ebp, 50) +// ROUND2(ebp, eax, ebx, ecx, edx, 51) +// ROUND2(edx, ebp, eax, ebx, ecx, 52) +// ROUND2(ecx, edx, ebp, eax, ebx, 53) +// ROUND2(ebx, ecx, edx, ebp, eax, 54) +// ROUND2(eax, ebx, ecx, edx, ebp, 55) +// ROUND2(ebp, eax, ebx, ecx, edx, 56) +// ROUND2(edx, ebp, eax, ebx, ecx, 57) +// ROUND2(ecx, edx, ebp, eax, ebx, 58) +// ROUND2(ebx, ecx, edx, ebp, eax, 59) +// ROUND3(eax, ebx, ecx, edx, ebp, 60) +// ROUND3(ebp, eax, ebx, ecx, edx, 61) +// ROUND3(edx, ebp, eax, ebx, ecx, 62) +// ROUND3(ecx, edx, ebp, eax, ebx, 63) +// ROUND3(ebx, ecx, edx, ebp, eax, 64) +// ROUND3(eax, ebx, ecx, edx, ebp, 65) +// ROUND3(ebp, eax, ebx, ecx, edx, 66) +// ROUND3(edx, ebp, eax, ebx, ecx, 67) +// ROUND3(ecx, edx, ebp, eax, ebx, 68) +// ROUND3(ebx, ecx, edx, ebp, eax, 69) +// ROUND3(eax, ebx, ecx, edx, ebp, 70) +// ROUND3(ebp, eax, ebx, ecx, edx, 71) +// ROUND3(edx, ebp, eax, ebx, ecx, 72) +// ROUND3(ecx, edx, ebp, eax, ebx, 73) +// ROUND3(ebx, ecx, edx, ebp, eax, 74) +// ROUND3(eax, ebx, ecx, edx, ebp, 75) +// ROUND3(ebp, eax, ebx, ecx, edx, 76) +// ROUND3(edx, ebp, eax, ebx, ecx, 77) +// ROUND3(ecx, edx, ebp, eax, ebx, 78) +// ROUND3(ebx, ecx, edx, ebp, eax, 79) +// +// /* Save updated state */ +// addl %eax, 0(%r8) +// addl %ebx, 4(%r8) +// addl %ecx, 8(%r8) +// addl %edx, 12(%r8) +// addl %ebp, 16(%r8) +// +// /* Restore registers */ +// movq %xmm0, %rbx +// movq %xmm1, %rbp +// addq $64, %rsp +// retq diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index da4a10a98..2e80ee090 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -5,6 +5,8 @@ cfg_if::cfg_if! 
 {
     if #[cfg(feature = "force-soft")] {
         mod soft;
         use soft::compress as compress_inner;
+    } else if #[cfg(all(feature = "inline-asm", any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] {
+        use crate::asm::compress as compress_inner;
     } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
         mod soft;
         mod aarch64;
diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs
index 38ddc4b51..8a003d2d9 100644
--- a/sha1/src/lib.rs
+++ b/sha1/src/lib.rs
@@ -63,10 +63,17 @@ use digest::{
     HashMarker, Output,
 };
 
+#[cfg(all(
+    feature = "inline-asm",
+    any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")
+))]
+mod asm;
+
 mod compress;
 
 #[cfg(feature = "compress")]
 pub use compress::compress;
+
 #[cfg(not(feature = "compress"))]
 use compress::compress;
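
Addendum (not part of the patch): the AArch64 block above raises several interface questions, namely how to get the block and state into q0-q3/q5/s16, where to keep the round-constant pool, and the trailing `ret`. One way to sidestep all of them is to express the same 20 quad-round structure with the `core::arch::aarch64` SHA-1 intrinsics and let Rust handle loads, stores and constants. The sketch below is only illustrative: the function name is hypothetical, it handles a single block (looping over `blocks` stays with the caller), it assumes the `sha2` target feature has been confirmed at runtime (e.g. via cpufeatures) and a little-endian target. Note these intrinsics reached stable Rust a little later than the 1.59 `asm!` MSRV targeted here (around 1.61), so this route would bump the feature's MSRV.

// Sketch only, not part of the patch: one SHA-1 block compressed with NEON SHA-1 intrinsics.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "sha2")]
unsafe fn sha1_compress_block_intrinsics(state: &mut [u32; 5], block: &[u8; 64]) {
    use core::arch::aarch64::*;

    // Round constants, one per group of 20 rounds (choose, parity, majority, parity).
    const K: [u32; 4] = [0x5A82_7999, 0x6ED9_EBA1, 0x8F1B_BCDC, 0xCA62_C1D6];

    let mut abcd = vld1q_u32(state.as_ptr());
    let mut e = state[4];
    let abcd_orig = abcd;
    let e_orig = e;

    // Load the 64-byte block and byte-swap each 32-bit word (message words are big-endian).
    let mut w0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr())));
    let mut w1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(16))));
    let mut w2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(32))));
    let mut w3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(48))));

    // 80 rounds, four at a time: the same 20 quad-round steps as the assembly above.
    for i in 0..20 {
        let wk = vaddq_u32(w0, vdupq_n_u32(K[i / 5]));
        // e for the next step is rotl30 of the current A (lane 0 of abcd).
        let e_next = vsha1h_u32(vgetq_lane_u32::<0>(abcd));
        abcd = match i / 5 {
            0 => vsha1cq_u32(abcd, e, wk), // rounds 0..19: choose
            2 => vsha1mq_u32(abcd, e, wk), // rounds 40..59: majority
            _ => vsha1pq_u32(abcd, e, wk), // rounds 20..39 and 60..79: parity
        };
        e = e_next;

        // Extend the message schedule and rotate the window of four W vectors.
        let extended = vsha1su1q_u32(vsha1su0q_u32(w0, w1, w2), w3);
        w0 = w1;
        w1 = w2;
        w2 = w3;
        w3 = extended;
    }

    // Feed-forward into the caller's state.
    vst1q_u32(state.as_mut_ptr(), vaddq_u32(abcd, abcd_orig));
    state[4] = e.wrapping_add(e_orig);
}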