diff --git a/.github/workflows/sha1.yml b/.github/workflows/sha1.yml
index ca09fd895..2eb5c4312 100644
--- a/.github/workflows/sha1.yml
+++ b/.github/workflows/sha1.yml
@@ -187,3 +187,26 @@ jobs:
           override: true
       - run: cargo test --no-default-features
       - run: cargo test
+
+  # TODO: merge with the main test job once the MSRV is bumped to 1.59 or higher
+  # TODO: do we need to cover no-std platforms here?
+  test-inline-asm:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        target:
+          - aarch64-unknown-linux-gnu
+          - x86_64-unknown-linux-gnu
+          - i686-unknown-linux-gnu
+          # TODO - aarch64-apple-darwin
+        rust:
+          - 1.59.0 # MSRV
+    steps:
+      - uses: actions/checkout@v3
+      - uses: RustCrypto/actions/cargo-cache@master
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: ${{ matrix.rust }}
+          override: true
+      - run: cargo test --features inline-asm
diff --git a/Cargo.lock b/Cargo.lock
index d851c17b4..66c3279ff 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "asm_block"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "466c0990cf15ef0f331f19fdc16fd60229606ab237476c70a66b747fce2911ab"
+
 [[package]]
 name = "blake2"
 version = "0.10.6"
@@ -204,6 +210,7 @@
 name = "sha1"
 version = "0.10.5"
 dependencies = [
+ "asm_block",
 "cfg-if",
 "cpufeatures",
 "digest",
diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml
index 8ff801508..06a7bc44e 100644
--- a/sha1/Cargo.toml
+++ b/sha1/Cargo.toml
@@ -18,6 +18,7 @@ cfg-if = "1.0"
 [target.'cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))'.dependencies]
 cpufeatures = "0.2"
 sha1-asm = { version = "0.5", optional = true }
+asm_block = { version = "0.1.3", optional = true }
 
 [dev-dependencies]
 digest = { version = "0.10.4", features = ["dev"] }
@@ -30,6 +31,7 @@ oid = ["digest/oid"] # Enable OID support. WARNING: Bumps MSRV to 1.57
 asm = ["sha1-asm"] # WARNING: this feature SHOULD NOT be enabled by library crates
 compress = [] # Expose compress function
 force-soft = [] # Force software implementation
+inline-asm = ["asm_block"] # Use inline-assembly implementations. WARNING: bumps MSRV to 1.59. TODO: check whether the "SHOULD NOT be enabled by library crates" warning on `asm` applies here as well.
 
 [package.metadata.docs.rs]
 all-features = true
diff --git a/sha1/README.md b/sha1/README.md
index bd76f0973..d388cc6f1 100644
--- a/sha1/README.md
+++ b/sha1/README.md
@@ -23,6 +23,8 @@ We provide this crate for legacy interoperability purposes only.
 
 Rust **1.41** or higher.
 
+Enabling the `inline-asm` feature requires Rust **1.59** or higher.
+
 Minimum supported Rust version can be changed in the future, but it will be done
 with a minor version bump.
diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs
new file mode 100644
index 000000000..c5fb97351
--- /dev/null
+++ b/sha1/src/asm/aarch64.rs
@@ -0,0 +1,279 @@
+//! SHA-1 hash in AArch64 assembly, adapted from Emmanuel Gil Peyrot's MIT-licensed implementation.
+//
+// /*
+// * SHA-1 hash in AArch64 assembly
+// *
+// * Copyright (c) 2020 Emmanuel Gil Peyrot. (MIT License)
+// *
+// * Permission is hereby granted, free of charge, to any person obtaining a copy of
+// * this software and associated documentation files (the "Software"), to deal in
+// * the Software without restriction, including without limitation the rights to
+// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+// * the Software, and to permit persons to whom the Software is furnished to do so,
+// * subject to the following conditions:
+// * - The above copyright notice and this permission notice shall be included in
+// *   all copies or substantial portions of the Software.
+// * - The Software is provided "as is", without warranty of any kind, express or
+// *   implied, including but not limited to the warranties of merchantability,
+// *   fitness for a particular purpose and noninfringement. In no event shall the
+// *   authors or copyright holders be liable for any claim, damages or other
+// *   liability, whether in an action of contract, tort or otherwise, arising from,
+// *   out of or in connection with the Software or the use or other dealings in the
+// *   Software.
+// */
+use core::arch::asm;
+
+/// SHA-1 compress function for AArch64, built on the SHA-1 crypto extension
+/// instructions. The register assignment mirrors the original standalone
+/// assembly (see the storage-usage table below), so registers are named
+/// explicitly instead of being left to the compiler's allocator.
+#[cfg(all(feature = "inline-asm", target_arch = "aarch64"))]
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // SAFETY: executes AArch64 SHA-1 crypto instructions; the caller must only
+    // dispatch here on CPUs that support them.
+    unsafe {
+        asm!(
+            // Storage usage docs from the original code:
+            // /*
+            // * Storage usage:
+            // *   Bytes  Location  Description
+            // *   4      x0        state argument
+            // *   4      x1        block argument
+            // *   16     q0        W0
+            // *   16     q1        W1
+            // *   16     q2        W2
+            // *   16     q3        W3
+            // *   16     q4        k
+            // *   16     q5        Original ABCD
+            // *   16     q6        ABCD (with s3 being A)
+            // *   4      s16       E
+            // *   4      s17       e0
+            // *   4      s18       e1
+            // *   16     q19       wk
+            // */
+
+            // Load state in registers
+            // original code:
+            //   ldr q5, [x0]
+            //   ldr s16, [x0, 16]
+            // this now happens at the bottom...
+            // TODO what is this doing?
+ // i believe it's copying state[0..4] into v6 (which is also q6) + // confirmed this is the mutable copy of the first 4 words of the state + "mov v6.16b, v5.16b", + + // Load block in registers + // original code: + // ldr q0, [x1] + // ldr q1, [x1, 16] + // ldr q2, [x1, 32] + // ldr q3, [x1, 48] + // this is at the bottom now + + // from original code: TODO: only do that on little endian + // this flips the blocks from little to big endian + "rev32 v0.16b, v0.16b", + "rev32 v1.16b, v1.16b", + "rev32 v2.16b, v2.16b", + "rev32 v3.16b, v3.16b", + + // k for the next five rounds + "adrp x1, .K0", + "ldr q4, [x1, #:lo12:.K0]", + + // 0 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1c q6, s16, v19.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 1 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1c q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 2 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1c q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 3 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1c q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 4 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1c q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // k for the next five rounds + "adrp x1, .K1", + "ldr q4, [x1, #:lo12:.K1]", + + // 5 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 6 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 7 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 8 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 9 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // k for the next five rounds + "adrp x1, .K2", + "ldr q4, [x1, #:lo12:.K2]", + + // 10 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 11 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1m q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 12 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 13 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1m q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 14 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // k for the next five rounds + "adrp x1, .K3", + "ldr q4, [x1, #:lo12:.K3]", + + // 15 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 16 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + + // 17 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + + // 18 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1p q6, s17, v19.4s", + + // 19 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + + // Update state + 
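+            // Feed-forward: v6 holds the working ABCD and v5 the original ABCD, while s16
+            // still holds the original E and s17 the rolling E produced by round 19; the two
+            // adds below fold the saved values back into the state, as FIPS 180-4 requires.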
"add v6.4s, v6.4s, v5.4s", + // source code: str q6, [x0] + // this now happens at the bottom + "add v16.2s, v16.2s, v17.2s", + // source code: str s16, [x0, 16] + // this now happens at the bottom + + "ret", // TODO is this right + + ".align 4", // TODO ummm alignment... + ".K0:", // TODO are labels just the same in inline asm in rust? + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".K1:", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".K2:", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".K3:", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + + // state ins and outs + in("q4") state.as_mut_ptr(), + inout("s16") state[4], + lateout("q6") state as *mut u32, + // blocks in + in("q0") blocks[0][0..16].as_ptr(), + in("q1") blocks[0][16..32].as_ptr(), + in("q2") blocks[0][32..48].as_ptr(), + in("q3") blocks[0][48..64].as_ptr(), + // some clobbers + out("q5") _, + out("s17") _, + out("s18") _, + out("q19") _, + // TODO make sure there aren't any other clobbers + ); + }; +} diff --git a/sha1/src/asm/aarch64_apple.rs b/sha1/src/asm/aarch64_apple.rs new file mode 100644 index 000000000..832c2c359 --- /dev/null +++ b/sha1/src/asm/aarch64_apple.rs @@ -0,0 +1,237 @@ +// /* +// * SHA-1 hash in AArch64 assembly +// * +// * Copyright (c) 2020 Emmanuel Gil Peyrot . (MIT License) +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. 
+// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// .global _sha1_compress +// _sha1_compress: +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 x0 state argument +// * 4 x1 block argument +// * 16 q0 W0 +// * 16 q1 W1 +// * 16 q2 W2 +// * 16 q3 W3 +// * 16 q4 k +// * 16 q5 Original ABCD +// * 16 q6 ABCD (with s3 being A) +// * 4 s16 E +// * 4 s17 e0 +// * 4 s18 e1 +// * 16 q19 wk +// */ +// +// // Load state in registers +// ldr q5, [x0] +// ldr s16, [x0, 16] +// mov v6.16b, v5.16b +// +// // Load block in registers +// ldr q0, [x1] +// ldr q1, [x1, 16] +// ldr q2, [x1, 32] +// ldr q3, [x1, 48] +// +// // TODO: only do that on little endian +// rev32 v0.16b, v0.16b +// rev32 v1.16b, v1.16b +// rev32 v2.16b, v2.16b +// rev32 v3.16b, v3.16b +// +// // k for the next five rounds +// adrp x1, .K0@PAGE +// ldr q4, [x1, #:lo12:.K0@PAGEOFF] +// +// // 0 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s16, v19.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 1 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 2 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 3 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 4 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // k for the next five rounds +// adrp x1, .K1@PAGE +// ldr q4, [x1, #:lo12:.K1@PAGEOFF] +// +// // 5 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 6 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 7 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 8 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 9 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // k for the next five rounds +// adrp x1, .K2@PAGE +// ldr q4, [x1, #:lo12:.K2@PAGEOFF] +// +// // 10 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 11 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 12 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 13 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 14 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // k for the next five rounds +// adrp x1, .K3@PAGE +// ldr q4, [x1, #:lo12:.K3@PAGEOFF] +// +// // 15 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 16 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s 
+// sha1su1 v3.4s, v2.4s +// +// // 17 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // 18 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// +// // 19 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // Update state +// add v6.4s, v6.4s, v5.4s +// str q6, [x0] +// add v16.2s, v16.2s, v17.2s +// str s16, [x0, 16] +// +// ret +// .align 4 +// .K0: +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .K1: +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .K2: +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .K3: +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 diff --git a/sha1/src/asm/mod.rs b/sha1/src/asm/mod.rs new file mode 100644 index 000000000..164c8065d --- /dev/null +++ b/sha1/src/asm/mod.rs @@ -0,0 +1,25 @@ +// TODO (laudiacay): here, do the switch to figure out which architecture's method we'll do... +// here's how that md5 PR did it (obviously wrong for what we want here...) +// #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +// mod x86; +// +// #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +// pub use x86::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "x86",))] +mod x86; +#[cfg(all(feature = "inline-asm", target_arch = "x86",))] +pub use x86::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "x86_64",))] +mod x86_64; +#[cfg(all(feature = "inline-asm", target_arch = "x86_64",))] +pub use x86_64::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "aarch64",))] +mod aarch64; +#[cfg(all(feature = "inline-asm", target_arch = "aarch64",))] +pub use aarch64::compress; + +// TODO(laudiacay) i don't know how to detect M1 +mod aarch64_apple; diff --git a/sha1/src/asm/x86.rs b/sha1/src/asm/x86.rs new file mode 100644 index 000000000..f7b24e839 --- /dev/null +++ b/sha1/src/asm/x86.rs @@ -0,0 +1,343 @@ +//! SHA-1 hash in x86 assembly. adapted from Project Nayuki's MIT licensed code... +// /* +// * SHA-1 hash in x86 assembly +// * +// * Copyright (c) 2014 Project Nayuki. (MIT License) +// * https://www.nayuki.io/page/fast-sha1-hash-implementation-in-x86-assembly +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. 
+// */ +use core::arch::asm; + +use asm_block::asm_block; + +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// #ifdef __APPLE__ +// .globl _sha1_compress +// _sha1_compress: +// #else +// .globl sha1_compress +// sha1_compress: +// #endif +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 eax SHA-1 state variable A +// * 4 ebx SHA-1 state variable B +// * 4 ecx SHA-1 state variable C +// * 4 edx SHA-1 state variable D +// * 4 ebp SHA-1 state variable E +// * 4 esi Temporary for calculation per round +// * 4 edi (First 16 rounds) base address of block array argument (read-only); (last 64 rounds) temporary for calculation per round +// * 4 esp x86 stack pointer +// * 64 [esp+ 0] Circular buffer of most recent 16 key schedule items, 4 bytes each +// * 4 [esp+64] Caller's value of ebx +// * 4 [esp+68] Caller's value of esi +// * 4 [esp+72] Caller's value of edi +// * 4 [esp+76] Caller's value of ebp +// */ + + +// #define round0a(a, b, c, d, e, i) \ +// movl (i*4)(%edi), %esi; \ +// bswapl %esi; \ +// movl %esi, (i*4)(%esp); \ +// addl %esi, %e; \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) + +macro_rules! round0a { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! { + movl ($i*4)(%edi), %esi; + bswapl %esi; + movl %esi, ($i*4)(%esp); + addl %esi, $e; + movl $c, %esi; + xorl $d, %esi; + andl $b, %esi; + xorl $d, %esi; + ROUNDTAIL!($a, $b, $e, $i, 0x5A827999); + } + } + }; +} + +// #define SCHEDULE(i, e) \ +// movl (((i- 3)&0xF)*4)(%esp), %esi; \ +// xorl (((i- 8)&0xF)*4)(%esp), %esi; \ +// xorl (((i-14)&0xF)*4)(%esp), %esi; \ +// xorl (((i-16)&0xF)*4)(%esp), %esi; \ +// roll $1, %esi; \ +// addl %esi, %e; \ +// movl %esi, ((i&0xF)*4)(%esp); + +macro_rules! schedule { + ($i:tt, $e:tt) => { + concat!{ + asm_block! { + movl ((($i- 3)&0xF)*4)(%esp), %esi; + xorl ((($i- 8)&0xF)*4)(%esp), %esi; + xorl ((($i-14)&0xF)*4)(%esp), %esi; + xorl ((($i-16)&0xF)*4)(%esp), %esi; + roll $1, %esi; + addl %esi, $e; + movl %esi, (($i&0xF)*4)(%esp); + } + } + }; +} + +// #define ROUND0b(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) + +macro_rules! round0b { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! { + schedule!($i, $e); + movl $c, %esi; + xorl $d, %esi; + andl $b, %esi; + xorl $d, %esi; + roundtail!($a, $b, $e, $i, 0x5A827999); + } + } + }; +} + +// #define ROUND1(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1) + +macro_rules! round1 { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! { + schedule!($i, $e); + movl $b, %esi; + xorl $c, %esi; + xorl $d, %esi; + roundtail!($a, $b, $e, $i, 0x6ED9EBA1); + } + } + }; +} + +// #define ROUND2(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// movl %c, %edi; \ +// orl %d, %esi; \ +// andl %b, %esi; \ +// andl %d, %edi; \ +// orl %edi, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x8F1BBCDC) + +macro_rules! round2 { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! 
{ + schedule!($i, $e); + movl $c, %esi; + movl $c, %edi; + orl $d, %esi; + andl $b, %esi; + andl $d, %edi; + orl %edi, %esi; + roundtail!($a, $b, $e, $i, 0x8F1BBCDC); + } + } + }; +} + +// #define ROUND3(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0xCA62C1D6) + +macro_rules! round3 { + ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => { + concat! { + asm_block! { + schedule!($i, $e); + movl $b, %esi; + xorl $c, %esi; + xorl $d, %esi; + roundtail!($a, $b, $e, $i, 0xCA62C1D6); + } + } + }; +} + +// #define ROUNDTAIL(a, b, e, i, k) \ +// roll $30, %b; \ +// leal k(%e,%esi), %e; \ +// movl %a, %esi; \ +// roll $5, %esi; \ +// addl %esi, %e; + +macro_rules! roundtail { + ($a:tt, $b:tt, $e:tt, $i:tt, $k:tt) => { + concat! { + asm_block! { + roll $30, $b; + leal $k($e,%esi), $e; + movl $a, %esi; + roll $5, %esi; + addl %esi, $e; + } + } + }; +} + +macro_rules! asm_sha1 { + // states + // /* Save registers */ +// subl $80, %esp +// movl %ebx, 64(%esp) +// movl %esi, 68(%esp) +// movl %edi, 72(%esp) +// movl %ebp, 76(%esp) +// +// /* Load arguments */ +// movl 84(%esp), %esi /* state */ +// movl 88(%esp), %edi /* block */ +// movl 0(%esi), %eax /* a */ +// movl 4(%esi), %ebx /* b */ +// movl 8(%esi), %ecx /* c */ +// movl 12(%esi), %edx /* d */ +// movl 16(%esi), %ebp /* e */ +// +// /* 80 rounds of hashing */ +// round0a(eax, ebx, ecx, edx, ebp, 0) +// round0a(ebp, eax, ebx, ecx, edx, 1) +// round0a(edx, ebp, eax, ebx, ecx, 2) +// round0a(ecx, edx, ebp, eax, ebx, 3) +// round0a(ebx, ecx, edx, ebp, eax, 4) +// round0a(eax, ebx, ecx, edx, ebp, 5) +// round0a(ebp, eax, ebx, ecx, edx, 6) +// round0a(edx, ebp, eax, ebx, ecx, 7) +// round0a(ecx, edx, ebp, eax, ebx, 8) +// round0a(ebx, ecx, edx, ebp, eax, 9) +// round0a(eax, ebx, ecx, edx, ebp, 10) +// round0a(ebp, eax, ebx, ecx, edx, 11) +// round0a(edx, ebp, eax, ebx, ecx, 12) +// round0a(ecx, edx, ebp, eax, ebx, 13) +// round0a(ebx, ecx, edx, ebp, eax, 14) +// round0a(eax, ebx, ecx, edx, ebp, 15) +// ROUND0b(ebp, eax, ebx, ecx, edx, 16) +// ROUND0b(edx, ebp, eax, ebx, ecx, 17) +// ROUND0b(ecx, edx, ebp, eax, ebx, 18) +// ROUND0b(ebx, ecx, edx, ebp, eax, 19) +// ROUND1(eax, ebx, ecx, edx, ebp, 20) +// ROUND1(ebp, eax, ebx, ecx, edx, 21) +// ROUND1(edx, ebp, eax, ebx, ecx, 22) +// ROUND1(ecx, edx, ebp, eax, ebx, 23) +// ROUND1(ebx, ecx, edx, ebp, eax, 24) +// ROUND1(eax, ebx, ecx, edx, ebp, 25) +// ROUND1(ebp, eax, ebx, ecx, edx, 26) +// ROUND1(edx, ebp, eax, ebx, ecx, 27) +// ROUND1(ecx, edx, ebp, eax, ebx, 28) +// ROUND1(ebx, ecx, edx, ebp, eax, 29) +// ROUND1(eax, ebx, ecx, edx, ebp, 30) +// ROUND1(ebp, eax, ebx, ecx, edx, 31) +// ROUND1(edx, ebp, eax, ebx, ecx, 32) +// ROUND1(ecx, edx, ebp, eax, ebx, 33) +// ROUND1(ebx, ecx, edx, ebp, eax, 34) +// ROUND1(eax, ebx, ecx, edx, ebp, 35) +// ROUND1(ebp, eax, ebx, ecx, edx, 36) +// ROUND1(edx, ebp, eax, ebx, ecx, 37) +// ROUND1(ecx, edx, ebp, eax, ebx, 38) +// ROUND1(ebx, ecx, edx, ebp, eax, 39) +// ROUND2(eax, ebx, ecx, edx, ebp, 40) +// ROUND2(ebp, eax, ebx, ecx, edx, 41) +// ROUND2(edx, ebp, eax, ebx, ecx, 42) +// ROUND2(ecx, edx, ebp, eax, ebx, 43) +// ROUND2(ebx, ecx, edx, ebp, eax, 44) +// ROUND2(eax, ebx, ecx, edx, ebp, 45) +// ROUND2(ebp, eax, ebx, ecx, edx, 46) +// ROUND2(edx, ebp, eax, ebx, ecx, 47) +// ROUND2(ecx, edx, ebp, eax, ebx, 48) +// ROUND2(ebx, ecx, edx, ebp, eax, 49) +// ROUND2(eax, ebx, ecx, edx, ebp, 50) +// ROUND2(ebp, eax, ebx, ecx, edx, 51) +// ROUND2(edx, ebp, eax, ebx, ecx, 52) +// ROUND2(ecx, 
edx, ebp, eax, ebx, 53)
+// ROUND2(ebx, ecx, edx, ebp, eax, 54)
+// ROUND2(eax, ebx, ecx, edx, ebp, 55)
+// ROUND2(ebp, eax, ebx, ecx, edx, 56)
+// ROUND2(edx, ebp, eax, ebx, ecx, 57)
+// ROUND2(ecx, edx, ebp, eax, ebx, 58)
+// ROUND2(ebx, ecx, edx, ebp, eax, 59)
+// ROUND3(eax, ebx, ecx, edx, ebp, 60)
+// ROUND3(ebp, eax, ebx, ecx, edx, 61)
+// ROUND3(edx, ebp, eax, ebx, ecx, 62)
+// ROUND3(ecx, edx, ebp, eax, ebx, 63)
+// ROUND3(ebx, ecx, edx, ebp, eax, 64)
+// ROUND3(eax, ebx, ecx, edx, ebp, 65)
+// ROUND3(ebp, eax, ebx, ecx, edx, 66)
+// ROUND3(edx, ebp, eax, ebx, ecx, 67)
+// ROUND3(ecx, edx, ebp, eax, ebx, 68)
+// ROUND3(ebx, ecx, edx, ebp, eax, 69)
+// ROUND3(eax, ebx, ecx, edx, ebp, 70)
+// ROUND3(ebp, eax, ebx, ecx, edx, 71)
+// ROUND3(edx, ebp, eax, ebx, ecx, 72)
+// ROUND3(ecx, edx, ebp, eax, ebx, 73)
+// ROUND3(ebx, ecx, edx, ebp, eax, 74)
+// ROUND3(eax, ebx, ecx, edx, ebp, 75)
+// ROUND3(ebp, eax, ebx, ecx, edx, 76)
+// ROUND3(edx, ebp, eax, ebx, ecx, 77)
+// ROUND3(ecx, edx, ebp, eax, ebx, 78)
+// ROUND3(ebx, ecx, edx, ebp, eax, 79)
+//
+// /* Save updated state */
+// movl 84(%esp), %esi
+// addl %eax, 0(%esi)
+// addl %ebx, 4(%esi)
+// addl %ecx, 8(%esi)
+// addl %edx, 12(%esi)
+// addl %ebp, 16(%esi)
+//
+// /* Restore registers */
+// movl 64(%esp), %ebx
+// movl 68(%esp), %esi
+// movl 72(%esp), %edi
+// movl 76(%esp), %ebp
+// addl $80, %esp
+// retl
+}
+
+#[cfg(all(feature = "inline-asm", target_arch = "x86"))]
+pub fn compress(_state: &mut [u32; 5], _blocks: &[[u8; 64]]) {
+    unimplemented!("compress() is not implemented for x86");
+}
diff --git a/sha1/src/asm/x86_64.rs b/sha1/src/asm/x86_64.rs
new file mode 100644
index 000000000..37c26dd15
--- /dev/null
+++ b/sha1/src/asm/x86_64.rs
@@ -0,0 +1,217 @@
+// /*
+// * SHA-1 hash in x86-64 assembly
+// *
+// * Copyright (c) 2015 Project Nayuki. (MIT License)
+// * https://www.nayuki.io/page/fast-sha1-hash-implementation-in-x86-assembly
+// *
+// * Permission is hereby granted, free of charge, to any person obtaining a copy of
+// * this software and associated documentation files (the "Software"), to deal in
+// * the Software without restriction, including without limitation the rights to
+// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+// * the Software, and to permit persons to whom the Software is furnished to do so,
+// * subject to the following conditions:
+// * - The above copyright notice and this permission notice shall be included in
+// *   all copies or substantial portions of the Software.
+// * - The Software is provided "as is", without warranty of any kind, express or
+// *   implied, including but not limited to the warranties of merchantability,
+// *   fitness for a particular purpose and noninfringement. In no event shall the
+// *   authors or copyright holders be liable for any claim, damages or other
+// *   liability, whether in an action of contract, tort or otherwise, arising from,
+// *   out of or in connection with the Software or the use or other dealings in the
+// *   Software.
+// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// #ifdef __APPLE__ +// .globl _sha1_compress +// _sha1_compress: +// #else +// .globl sha1_compress +// sha1_compress: +// #endif +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 eax SHA-1 state variable A +// * 4 ebx SHA-1 state variable B +// * 4 ecx SHA-1 state variable C +// * 4 edx SHA-1 state variable D +// * 4 ebp SHA-1 state variable E +// * 4 esi Temporary for calculation per round +// * 4 edi (Last 64 rounds) temporary for calculation per round +// * 8 rdi (First 16 rounds) base address of block array argument (read-only) +// * 8 r8 Base address of state array argument (read-only) +// * 8 rsp x86-64 stack pointer +// * 64 [rsp+0] Circular buffer of most recent 16 key schedule items, 4 bytes each +// * 16 xmm0 Caller's value of rbx (only low 64 bits are used) +// * 16 xmm1 Caller's value of rbp (only low 64 bits are used) +// */ +// +// #define round0a(a, b, c, d, e, i) \ +// movl (i*4)(%rdi), %esi; \ +// bswapl %esi; \ +// movl %esi, (i*4)(%rsp); \ +// addl %esi, %e; \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define SCHEDULE(i, e) \ +// movl (((i- 3)&0xF)*4)(%rsp), %esi; \ +// xorl (((i- 8)&0xF)*4)(%rsp), %esi; \ +// xorl (((i-14)&0xF)*4)(%rsp), %esi; \ +// xorl (((i-16)&0xF)*4)(%rsp), %esi; \ +// roll $1, %esi; \ +// addl %esi, %e; \ +// movl %esi, ((i&0xF)*4)(%rsp); +// +// #define ROUND0b(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define ROUND1(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1) +// +// #define ROUND2(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// movl %c, %edi; \ +// orl %d, %esi; \ +// andl %b, %esi; \ +// andl %d, %edi; \ +// orl %edi, %esi; \ +// ROUNDTAIL(a, b, e, i, -0x70E44324) +// +// #define ROUND3(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, -0x359D3E2A) +// +// #define ROUNDTAIL(a, b, e, i, k) \ +// roll $30, %b; \ +// leal k(%e,%esi), %e; \ +// movl %a, %esi; \ +// roll $5, %esi; \ +// addl %esi, %e; +// +// /* Save registers, allocate scratch space */ +// movq %rbx, %xmm0 +// movq %rbp, %xmm1 +// subq $64, %rsp +// +// /* Load arguments */ +// movq %rdi, %r8 +// movl 0(%rdi), %eax /* a */ +// movl 4(%rdi), %ebx /* b */ +// movl 8(%rdi), %ecx /* c */ +// movl 12(%rdi), %edx /* d */ +// movl 16(%rdi), %ebp /* e */ +// movq %rsi, %rdi +// +// /* 80 rounds of hashing */ +// round0a(eax, ebx, ecx, edx, ebp, 0) +// round0a(ebp, eax, ebx, ecx, edx, 1) +// round0a(edx, ebp, eax, ebx, ecx, 2) +// round0a(ecx, edx, ebp, eax, ebx, 3) +// round0a(ebx, ecx, edx, ebp, eax, 4) +// round0a(eax, ebx, ecx, edx, ebp, 5) +// round0a(ebp, eax, ebx, ecx, edx, 6) +// round0a(edx, ebp, eax, ebx, ecx, 7) +// round0a(ecx, edx, ebp, eax, ebx, 8) +// round0a(ebx, ecx, edx, ebp, eax, 9) +// round0a(eax, ebx, ecx, edx, ebp, 10) +// round0a(ebp, eax, ebx, ecx, edx, 11) +// round0a(edx, ebp, eax, ebx, ecx, 12) +// round0a(ecx, edx, ebp, eax, ebx, 13) +// round0a(ebx, ecx, edx, ebp, eax, 14) +// round0a(eax, ebx, ecx, edx, ebp, 15) +// ROUND0b(ebp, eax, ebx, ecx, edx, 16) +// ROUND0b(edx, ebp, eax, ebx, ecx, 17) +// ROUND0b(ecx, edx, ebp, eax, ebx, 18) +// 
ROUND0b(ebx, ecx, edx, ebp, eax, 19) +// ROUND1(eax, ebx, ecx, edx, ebp, 20) +// ROUND1(ebp, eax, ebx, ecx, edx, 21) +// ROUND1(edx, ebp, eax, ebx, ecx, 22) +// ROUND1(ecx, edx, ebp, eax, ebx, 23) +// ROUND1(ebx, ecx, edx, ebp, eax, 24) +// ROUND1(eax, ebx, ecx, edx, ebp, 25) +// ROUND1(ebp, eax, ebx, ecx, edx, 26) +// ROUND1(edx, ebp, eax, ebx, ecx, 27) +// ROUND1(ecx, edx, ebp, eax, ebx, 28) +// ROUND1(ebx, ecx, edx, ebp, eax, 29) +// ROUND1(eax, ebx, ecx, edx, ebp, 30) +// ROUND1(ebp, eax, ebx, ecx, edx, 31) +// ROUND1(edx, ebp, eax, ebx, ecx, 32) +// ROUND1(ecx, edx, ebp, eax, ebx, 33) +// ROUND1(ebx, ecx, edx, ebp, eax, 34) +// ROUND1(eax, ebx, ecx, edx, ebp, 35) +// ROUND1(ebp, eax, ebx, ecx, edx, 36) +// ROUND1(edx, ebp, eax, ebx, ecx, 37) +// ROUND1(ecx, edx, ebp, eax, ebx, 38) +// ROUND1(ebx, ecx, edx, ebp, eax, 39) +// ROUND2(eax, ebx, ecx, edx, ebp, 40) +// ROUND2(ebp, eax, ebx, ecx, edx, 41) +// ROUND2(edx, ebp, eax, ebx, ecx, 42) +// ROUND2(ecx, edx, ebp, eax, ebx, 43) +// ROUND2(ebx, ecx, edx, ebp, eax, 44) +// ROUND2(eax, ebx, ecx, edx, ebp, 45) +// ROUND2(ebp, eax, ebx, ecx, edx, 46) +// ROUND2(edx, ebp, eax, ebx, ecx, 47) +// ROUND2(ecx, edx, ebp, eax, ebx, 48) +// ROUND2(ebx, ecx, edx, ebp, eax, 49) +// ROUND2(eax, ebx, ecx, edx, ebp, 50) +// ROUND2(ebp, eax, ebx, ecx, edx, 51) +// ROUND2(edx, ebp, eax, ebx, ecx, 52) +// ROUND2(ecx, edx, ebp, eax, ebx, 53) +// ROUND2(ebx, ecx, edx, ebp, eax, 54) +// ROUND2(eax, ebx, ecx, edx, ebp, 55) +// ROUND2(ebp, eax, ebx, ecx, edx, 56) +// ROUND2(edx, ebp, eax, ebx, ecx, 57) +// ROUND2(ecx, edx, ebp, eax, ebx, 58) +// ROUND2(ebx, ecx, edx, ebp, eax, 59) +// ROUND3(eax, ebx, ecx, edx, ebp, 60) +// ROUND3(ebp, eax, ebx, ecx, edx, 61) +// ROUND3(edx, ebp, eax, ebx, ecx, 62) +// ROUND3(ecx, edx, ebp, eax, ebx, 63) +// ROUND3(ebx, ecx, edx, ebp, eax, 64) +// ROUND3(eax, ebx, ecx, edx, ebp, 65) +// ROUND3(ebp, eax, ebx, ecx, edx, 66) +// ROUND3(edx, ebp, eax, ebx, ecx, 67) +// ROUND3(ecx, edx, ebp, eax, ebx, 68) +// ROUND3(ebx, ecx, edx, ebp, eax, 69) +// ROUND3(eax, ebx, ecx, edx, ebp, 70) +// ROUND3(ebp, eax, ebx, ecx, edx, 71) +// ROUND3(edx, ebp, eax, ebx, ecx, 72) +// ROUND3(ecx, edx, ebp, eax, ebx, 73) +// ROUND3(ebx, ecx, edx, ebp, eax, 74) +// ROUND3(eax, ebx, ecx, edx, ebp, 75) +// ROUND3(ebp, eax, ebx, ecx, edx, 76) +// ROUND3(edx, ebp, eax, ebx, ecx, 77) +// ROUND3(ecx, edx, ebp, eax, ebx, 78) +// ROUND3(ebx, ecx, edx, ebp, eax, 79) +// +// /* Save updated state */ +// addl %eax, 0(%r8) +// addl %ebx, 4(%r8) +// addl %ecx, 8(%r8) +// addl %edx, 12(%r8) +// addl %ebp, 16(%r8) +// +// /* Restore registers */ +// movq %xmm0, %rbx +// movq %xmm1, %rbp +// addq $64, %rsp +// retq diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index da4a10a98..2e80ee090 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -5,6 +5,8 @@ cfg_if::cfg_if! 
 {
     if #[cfg(feature = "force-soft")] {
         mod soft;
         use soft::compress as compress_inner;
+    } else if #[cfg(all(feature = "inline-asm", any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] {
+        use crate::asm::compress as compress_inner;
     } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
         mod soft;
         mod aarch64;
diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs
index 38ddc4b51..8a003d2d9 100644
--- a/sha1/src/lib.rs
+++ b/sha1/src/lib.rs
@@ -63,10 +63,17 @@ use digest::{
     HashMarker, Output,
 };
 
+#[cfg(all(
+    feature = "inline-asm",
+    any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")
+))]
+mod asm;
+
 mod compress;
 
 #[cfg(feature = "compress")]
 pub use compress::compress;
+
 #[cfg(not(feature = "compress"))]
 use compress::compress;
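
Addendum (not part of the patch): the AArch64 block above raises several interface questions, namely how to get the block and state into q0-q3/q5/s16, where to keep the round-constant pool, and the trailing `ret`. One way to sidestep all of them is to express the same 20 quad-round structure with the `core::arch::aarch64` SHA-1 intrinsics and let Rust handle loads, stores and constants. The sketch below is only illustrative: the function name is hypothetical, it handles a single block (looping over `blocks` stays with the caller), it assumes the `sha2` target feature has been confirmed at runtime (e.g. via cpufeatures) and a little-endian target. Note these intrinsics reached stable Rust a little later than the 1.59 `asm!` MSRV targeted here (around 1.61), so this route would bump the feature's MSRV.

// Sketch only, not part of the patch: one SHA-1 block compressed with NEON SHA-1 intrinsics.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "sha2")]
unsafe fn sha1_compress_block_intrinsics(state: &mut [u32; 5], block: &[u8; 64]) {
    use core::arch::aarch64::*;

    // Round constants, one per group of 20 rounds (choose, parity, majority, parity).
    const K: [u32; 4] = [0x5A82_7999, 0x6ED9_EBA1, 0x8F1B_BCDC, 0xCA62_C1D6];

    let mut abcd = vld1q_u32(state.as_ptr());
    let mut e = state[4];
    let abcd_orig = abcd;
    let e_orig = e;

    // Load the 64-byte block and byte-swap each 32-bit word (message words are big-endian).
    let mut w0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr())));
    let mut w1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(16))));
    let mut w2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(32))));
    let mut w3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(48))));

    // 80 rounds, four at a time: the same 20 quad-round steps as the assembly above.
    for i in 0..20 {
        let wk = vaddq_u32(w0, vdupq_n_u32(K[i / 5]));
        // e for the next step is rotl30 of the current A (lane 0 of abcd).
        let e_next = vsha1h_u32(vgetq_lane_u32::<0>(abcd));
        abcd = match i / 5 {
            0 => vsha1cq_u32(abcd, e, wk), // rounds 0..19: choose
            2 => vsha1mq_u32(abcd, e, wk), // rounds 40..59: majority
            _ => vsha1pq_u32(abcd, e, wk), // rounds 20..39 and 60..79: parity
        };
        e = e_next;

        // Extend the message schedule and rotate the window of four W vectors.
        let extended = vsha1su1q_u32(vsha1su0q_u32(w0, w1, w2), w3);
        w0 = w1;
        w1 = w2;
        w2 = w3;
        w3 = extended;
    }

    // Feed-forward into the caller's state.
    vst1q_u32(state.as_mut_ptr(), vaddq_u32(abcd, abcd_orig));
    state[4] = e.wrapping_add(e_orig);
}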