Verify Intel intrinsics against upstream definitions

This commit adds a new crate for testing that the intrinsics listed in this crate do indeed match the upstream definition of each intrinsic. A pre-downloaded XML description of all Intel intrinsics is checked in which is then parsed in the `stdsimd-verify` crate to verify that everything we write down is matched against the upstream definitions. Currently the checks are pretty loose to get this compiling but a few intrinsics were fixed as a result of this. For example: * `_mm256_extract_epi8` - AVX2 intrinsic erroneously listed under AVX * `_mm256_extract_epi16` - AVX2 intrinsic erroneously listed under AVX * `_mm256_extract_epi32` - AVX2 intrinsic erroneously listed under AVX * `_mm256_extract_epi64` - AVX2 intrinsic erroneously listed under AVX * `_mm_tzcnt_32` - erroneously had `u32` in the name * `_mm_tzcnt_64` - erroneously had `u64` in the name * `_mm_cvtsi64_si128` - erroneously available on 32-bit platforms * `_mm_cvtsi64x_si128` - erroneously available on 32-bit platforms * `_mm_cvtsi128_si64` - erroneously available on 32-bit platforms * `_mm_cvtsi128_si64x` - erroneously available on 32-bit platforms * `_mm_extract_epi64` - erroneously available on 32-bit platforms * `_mm_insert_epi64` - erroneously available on 32-bit platforms * `_mm256_extract_epi16` - erroneously returned i32 instead of i16 * `_mm256_extract_epi8` - erroneously returned i32 instead of i8 * `_mm_shuffle_ps` - the mask argument was erroneously i32 instead of u32 * `_popcnt32` - the signededness of the argument and return were flipped * `_popcnt64` - the signededness of the argument was flipped and the argument was too large bit-wise * `_mm_tzcnt_32` - the return value's sign was flipped * `_mm_tzcnt_64` - the return value's sign was flipped * A good number of intrinsics used `imm8: i8` or `imm8: u8` instead of `imm8: i32` which Intel was using. (we were also internally inconsistent) * A number of intrinsics working with `__m64` were instead working with i64/u64, so they're now corrected to operate with the vector types instead. Currently the verifications performed are: * Each name in Rust is defined in the XML document * The arguments/return values all agree. * The CPUID features listed in the XML document are all enabled in Rust as well. The type matching right now is pretty loose and has a lot of questionable changes. Future commits will touch these up to be more strict and require closer adherence with Intel's own types. Otherwise types like `i32x8` (or any integers with 256 bits) all match up to `__m256i` right now, althoguh this may want to change in the future. Finally we're also not testing the instruction listed in the XML right now. There's a huge number of discrepancies between the instruction listed in the XML and the instruction listed in `assert_instr`, and those'll need to be taken care of in a future commit. Closes rust-lang#240
alexcrichton · Dec 29, 2017 · 13e3af1 · 13e3af1
1 parent 2c4d880
commit 13e3af1
Show file tree

Hide file tree

Showing 29 changed files with 135,876 additions and 366 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -24,6 +24,8 @@ matrix:
     - env: DOCUMENTATION
       install: true
       script: ci/dox.sh
+    - script: cargo test --manifest-path stdsimd-verify/Cargo.toml
+      install: true
     - env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
       script: |
         cargo install rustfmt-nightly --force
@@ -40,6 +42,8 @@ install:
 
 script:
   - cargo generate-lockfile
+  # FIXME (travis-ci/travis-ci#8920) shouldn't be necessary...
+  - python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"
   - ci/run-docker.sh $TARGET $FEATURES
 
 notifications:

diff --git a/Cargo.toml b/Cargo.toml
@@ -12,6 +12,7 @@ categories = ["hardware-support"]
 license = "MIT/Apache-2.0"
 
 [workspace]
+members = ["stdsimd-verify"]
 
 [badges]
 travis-ci = { repository = "BurntSushi/stdsimd" }

diff --git a/ci/run.sh b/ci/run.sh
@@ -22,7 +22,9 @@ echo "FEATURES=${FEATURES}"
 echo "OBJDUMP=${OBJDUMP}"
 
 cargo_test() {
-    cmd="cargo test --all --target=$TARGET --features $FEATURES --verbose $1 -- --nocapture $2"
+    cmd="cargo test --target=$TARGET --features $FEATURES $1"
+    cmd="$cmd -p coresimd -p stdsimd"
+    cmd="$cmd -- $2"
     $cmd
 }
 

diff --git a/coresimd/src/x86/i586/abm.rs b/coresimd/src/x86/i586/abm.rs
@@ -44,16 +44,16 @@ pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+popcnt"]
 #[cfg_attr(test, assert_instr(popcnt))]
-pub unsafe fn _popcnt32(x: u32) -> u32 {
-    x.count_ones()
+pub unsafe fn _popcnt32(x: i32) -> i32 {
+    x.count_ones() as i32
 }
 
 /// Counts the bits that are set.
 #[inline(always)]
 #[target_feature = "+popcnt"]
 #[cfg_attr(test, assert_instr(popcnt))]
-pub unsafe fn _popcnt64(x: u64) -> u64 {
-    x.count_ones() as u64
+pub unsafe fn _popcnt64(x: i64) -> i32 {
+    x.count_ones() as i32
 }
 
 #[cfg(test)]
@@ -64,21 +64,21 @@ mod tests {
 
     #[simd_test = "lzcnt"]
     unsafe fn _lzcnt_u32() {
-        assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
+        assert_eq!(abm::_lzcnt_u32(0b0101_1010), 25);
     }
 
     #[simd_test = "lzcnt"]
     unsafe fn _lzcnt_u64() {
-        assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
+        assert_eq!(abm::_lzcnt_u64(0b0101_1010), 57);
     }
 
     #[simd_test = "popcnt"]
     unsafe fn _popcnt32() {
-        assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
+        assert_eq!(abm::_popcnt32(0b0101_1010), 4);
     }
 
     #[simd_test = "popcnt"]
     unsafe fn _popcnt64() {
-        assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
+        assert_eq!(abm::_popcnt64(0b0101_1010), 4);
     }
 }
diff --git a/coresimd/src/x86/i586/avx.rs b/coresimd/src/x86/i586/avx.rs
@@ -607,77 +607,77 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
 }
 
 /// Equal (ordered, non-signaling)
-pub const _CMP_EQ_OQ: u8 = 0x00;
+pub const _CMP_EQ_OQ: i32 = 0x00;
 /// Less-than (ordered, signaling)
-pub const _CMP_LT_OS: u8 = 0x01;
+pub const _CMP_LT_OS: i32 = 0x01;
 /// Less-than-or-equal (ordered, signaling)
-pub const _CMP_LE_OS: u8 = 0x02;
+pub const _CMP_LE_OS: i32 = 0x02;
 /// Unordered (non-signaling)
-pub const _CMP_UNORD_Q: u8 = 0x03;
+pub const _CMP_UNORD_Q: i32 = 0x03;
 /// Not-equal (unordered, non-signaling)
-pub const _CMP_NEQ_UQ: u8 = 0x04;
+pub const _CMP_NEQ_UQ: i32 = 0x04;
 /// Not-less-than (unordered, signaling)
-pub const _CMP_NLT_US: u8 = 0x05;
+pub const _CMP_NLT_US: i32 = 0x05;
 /// Not-less-than-or-equal (unordered, signaling)
-pub const _CMP_NLE_US: u8 = 0x06;
+pub const _CMP_NLE_US: i32 = 0x06;
 /// Ordered (non-signaling)
-pub const _CMP_ORD_Q: u8 = 0x07;
+pub const _CMP_ORD_Q: i32 = 0x07;
 /// Equal (unordered, non-signaling)
-pub const _CMP_EQ_UQ: u8 = 0x08;
+pub const _CMP_EQ_UQ: i32 = 0x08;
 /// Not-greater-than-or-equal (unordered, signaling)
-pub const _CMP_NGE_US: u8 = 0x09;
+pub const _CMP_NGE_US: i32 = 0x09;
 /// Not-greater-than (unordered, signaling)
-pub const _CMP_NGT_US: u8 = 0x0a;
+pub const _CMP_NGT_US: i32 = 0x0a;
 /// False (ordered, non-signaling)
-pub const _CMP_FALSE_OQ: u8 = 0x0b;
+pub const _CMP_FALSE_OQ: i32 = 0x0b;
 /// Not-equal (ordered, non-signaling)
-pub const _CMP_NEQ_OQ: u8 = 0x0c;
+pub const _CMP_NEQ_OQ: i32 = 0x0c;
 /// Greater-than-or-equal (ordered, signaling)
-pub const _CMP_GE_OS: u8 = 0x0d;
+pub const _CMP_GE_OS: i32 = 0x0d;
 /// Greater-than (ordered, signaling)
-pub const _CMP_GT_OS: u8 = 0x0e;
+pub const _CMP_GT_OS: i32 = 0x0e;
 /// True (unordered, non-signaling)
-pub const _CMP_TRUE_UQ: u8 = 0x0f;
+pub const _CMP_TRUE_UQ: i32 = 0x0f;
 /// Equal (ordered, signaling)
-pub const _CMP_EQ_OS: u8 = 0x10;
+pub const _CMP_EQ_OS: i32 = 0x10;
 /// Less-than (ordered, non-signaling)
-pub const _CMP_LT_OQ: u8 = 0x11;
+pub const _CMP_LT_OQ: i32 = 0x11;
 /// Less-than-or-equal (ordered, non-signaling)
-pub const _CMP_LE_OQ: u8 = 0x12;
+pub const _CMP_LE_OQ: i32 = 0x12;
 /// Unordered (signaling)
-pub const _CMP_UNORD_S: u8 = 0x13;
+pub const _CMP_UNORD_S: i32 = 0x13;
 /// Not-equal (unordered, signaling)
-pub const _CMP_NEQ_US: u8 = 0x14;
+pub const _CMP_NEQ_US: i32 = 0x14;
 /// Not-less-than (unordered, non-signaling)
-pub const _CMP_NLT_UQ: u8 = 0x15;
+pub const _CMP_NLT_UQ: i32 = 0x15;
 /// Not-less-than-or-equal (unordered, non-signaling)
-pub const _CMP_NLE_UQ: u8 = 0x16;
+pub const _CMP_NLE_UQ: i32 = 0x16;
 /// Ordered (signaling)
-pub const _CMP_ORD_S: u8 = 0x17;
+pub const _CMP_ORD_S: i32 = 0x17;
 /// Equal (unordered, signaling)
-pub const _CMP_EQ_US: u8 = 0x18;
+pub const _CMP_EQ_US: i32 = 0x18;
 /// Not-greater-than-or-equal (unordered, non-signaling)
-pub const _CMP_NGE_UQ: u8 = 0x19;
+pub const _CMP_NGE_UQ: i32 = 0x19;
 /// Not-greater-than (unordered, non-signaling)
-pub const _CMP_NGT_UQ: u8 = 0x1a;
+pub const _CMP_NGT_UQ: i32 = 0x1a;
 /// False (ordered, signaling)
-pub const _CMP_FALSE_OS: u8 = 0x1b;
+pub const _CMP_FALSE_OS: i32 = 0x1b;
 /// Not-equal (ordered, signaling)
-pub const _CMP_NEQ_OS: u8 = 0x1c;
+pub const _CMP_NEQ_OS: i32 = 0x1c;
 /// Greater-than-or-equal (ordered, non-signaling)
-pub const _CMP_GE_OQ: u8 = 0x1d;
+pub const _CMP_GE_OQ: i32 = 0x1d;
 /// Greater-than (ordered, non-signaling)
-pub const _CMP_GT_OQ: u8 = 0x1e;
+pub const _CMP_GT_OQ: i32 = 0x1e;
 /// True (unordered, signaling)
-pub const _CMP_TRUE_US: u8 = 0x1f;
+pub const _CMP_TRUE_US: i32 = 0x1f;
 
 /// Compare packed double-precision (64-bit) floating-point
 /// elements in `a` and `b` based on the comparison operand
 /// specified by `imm8`.
 #[inline(always)]
 #[target_feature = "+avx,+sse2"]
 #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
-pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => { vcmppd(a, b, $imm8) }
     }
@@ -690,7 +690,7 @@ pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
-pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
+pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
     macro_rules! call {
         ($imm8:expr) => { vcmppd256(a, b, $imm8) }
     }
@@ -703,7 +703,7 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx,+sse"]
 #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
-pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => { vcmpps(a, b, $imm8) }
     }
@@ -716,7 +716,7 @@ pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
-pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
+pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
     macro_rules! call {
         ($imm8:expr) => { vcmpps256(a, b, $imm8) }
     }
@@ -731,7 +731,7 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx,+sse2"]
 #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
-pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => { vcmpsd(a, b, $imm8) }
     }
@@ -746,7 +746,7 @@ pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+avx,+sse"]
 #[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss
-pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => { vcmpss(a, b, $imm8) }
     }
@@ -862,48 +862,6 @@ pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i {
     __m128i::from(dst)
 }
 
-/// Extract an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
-/// integer containing the zero-extended integer data.
-///
-/// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
-#[inline(always)]
-#[target_feature = "+avx"]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i32 {
-    let imm8 = (imm8 & 31) as u32;
-    (a.extract_unchecked(imm8) as i32) & 0xFF
-}
-
-/// Extract a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit
-/// integer containing the zero-extended integer data.
-///
-/// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
-#[inline(always)]
-#[target_feature = "+avx"]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i32 {
-    let imm8 = (imm8 & 15) as u32;
-    (a.extract_unchecked(imm8) as i32) & 0xFFFF
-}
-
-/// Extract a 32-bit integer from `a`, selected with `imm8`.
-#[inline(always)]
-#[target_feature = "+avx"]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 {
-    let imm8 = (imm8 & 7) as u32;
-    a.extract_unchecked(imm8)
-}
-
-/// Extract a 64-bit integer from `a`, selected with `imm8`.
-#[inline(always)]
-#[target_feature = "+avx"]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i64 {
-    let imm8 = (imm8 & 3) as u32;
-    a.extract_unchecked(imm8)
-}
-
 /// Zero the contents of all XMM or YMM registers.
 #[inline(always)]
 #[target_feature = "+avx"]
@@ -1138,7 +1096,7 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))]
-pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
+pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
     macro_rules! call {
         ($imm8:expr) => { vperm2f128ps256(a, b, $imm8) }
     }
@@ -1150,7 +1108,7 @@ pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
-pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
+pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
     macro_rules! call {
         ($imm8:expr) => { vperm2f128pd256(a, b, $imm8) }
     }
@@ -1163,7 +1121,7 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
 pub unsafe fn _mm256_permute2f128_si256(
-    a: i32x8, b: i32x8, imm8: i8
+    a: i32x8, b: i32x8, imm8: i32
 ) -> i32x8 {
     macro_rules! call {
         ($imm8:expr) => { vperm2f128si256(a, b, $imm8) }
@@ -3146,47 +3104,6 @@ mod tests {
         assert_eq!(r, __m128i::from(e));
     }
 
-    #[simd_test = "avx"]
-    unsafe fn _mm256_extract_epi8() {
-        #[cfg_attr(rustfmt, rustfmt_skip)]
-        let a = i8x32::new(
-            -1, 1, 2, 3, 4, 5, 6, 7,
-            8, 9, 10, 11, 12, 13, 14, 15,
-            16, 17, 18, 19, 20, 21, 22, 23,
-            24, 25, 26, 27, 28, 29, 30, 31
-        );
-        let r1 = avx::_mm256_extract_epi8(a, 0);
-        let r2 = avx::_mm256_extract_epi8(a, 35);
-        assert_eq!(r1, 0xFF);
-        assert_eq!(r2, 3);
-    }
-
-    #[simd_test = "avx"]
-    unsafe fn _mm256_extract_epi16() {
-        let a =
-            i16x16::new(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let r1 = avx::_mm256_extract_epi16(a, 0);
-        let r2 = avx::_mm256_extract_epi16(a, 19);
-        assert_eq!(r1, 0xFFFF);
-        assert_eq!(r2, 3);
-    }
-
-    #[simd_test = "avx"]
-    unsafe fn _mm256_extract_epi32() {
-        let a = i32x8::new(-1, 1, 2, 3, 4, 5, 6, 7);
-        let r1 = avx::_mm256_extract_epi32(a, 0);
-        let r2 = avx::_mm256_extract_epi32(a, 11);
-        assert_eq!(r1, -1);
-        assert_eq!(r2, 3);
-    }
-
-    #[simd_test = "avx"]
-    unsafe fn _mm256_extract_epi64() {
-        let a = i64x4::new(0, 1, 2, 3);
-        let r = avx::_mm256_extract_epi64(a, 3);
-        assert_eq!(r, 3);
-    }
-
     #[simd_test = "avx"]
     unsafe fn _mm256_zeroall() {
         avx::_mm256_zeroall();